diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/404.html b/404.html
new file mode 100644
index 00000000..f6eae60a
--- /dev/null
+++ b/404.html
@@ -0,0 +1,184 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+      <link rel="shortcut icon" href="/unofficial-loongarch-intrinsics-guide/img/favicon.ico" />
+    <title>Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="/unofficial-loongarch-intrinsics-guide/css/theme.css" />
+    <link rel="stylesheet" href="/unofficial-loongarch-intrinsics-guide/css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="/unofficial-loongarch-intrinsics-guide/main.css" rel="stylesheet" />
+    
+    <!--[if lt IE 9]>
+      <script src="/unofficial-loongarch-intrinsics-guide/js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="/unofficial-loongarch-intrinsics-guide/." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/.">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LASX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LSX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="/unofficial-loongarch-intrinsics-guide/lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="/unofficial-loongarch-intrinsics-guide/.">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="/unofficial-loongarch-intrinsics-guide/." class="icon icon-home" aria-label="Docs"></a></li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+
+  <h1 id="404-page-not-found">404</h1>
+
+  <p><strong>Page not found</strong></p>
+
+
+            </div>
+          </div><footer>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+    
+  </span>
+</div>
+    <script src="/unofficial-loongarch-intrinsics-guide/js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "/unofficial-loongarch-intrinsics-guide/";</script>
+    <script src="/unofficial-loongarch-intrinsics-guide/js/theme_extra.js"></script>
+    <script src="/unofficial-loongarch-intrinsics-guide/js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/css/fonts/Roboto-Slab-Bold.woff b/css/fonts/Roboto-Slab-Bold.woff
new file mode 100644
index 00000000..6cb60000
Binary files /dev/null and b/css/fonts/Roboto-Slab-Bold.woff differ
diff --git a/css/fonts/Roboto-Slab-Bold.woff2 b/css/fonts/Roboto-Slab-Bold.woff2
new file mode 100644
index 00000000..7059e231
Binary files /dev/null and b/css/fonts/Roboto-Slab-Bold.woff2 differ
diff --git a/css/fonts/Roboto-Slab-Regular.woff b/css/fonts/Roboto-Slab-Regular.woff
new file mode 100644
index 00000000..f815f63f
Binary files /dev/null and b/css/fonts/Roboto-Slab-Regular.woff differ
diff --git a/css/fonts/Roboto-Slab-Regular.woff2 b/css/fonts/Roboto-Slab-Regular.woff2
new file mode 100644
index 00000000..f2c76e5b
Binary files /dev/null and b/css/fonts/Roboto-Slab-Regular.woff2 differ
diff --git a/css/fonts/fontawesome-webfont.eot b/css/fonts/fontawesome-webfont.eot
new file mode 100644
index 00000000..e9f60ca9
Binary files /dev/null and b/css/fonts/fontawesome-webfont.eot differ
diff --git a/css/fonts/fontawesome-webfont.svg b/css/fonts/fontawesome-webfont.svg
new file mode 100644
index 00000000..855c845e
--- /dev/null
+++ b/css/fonts/fontawesome-webfont.svg
@@ -0,0 +1,2671 @@
+<?xml version="1.0" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd" >
+<svg>
+<metadata>
+Created by FontForge 20120731 at Mon Oct 24 17:37:40 2016
+ By ,,,
+Copyright Dave Gandy 2016. All rights reserved.
+</metadata>
+<defs>
+<font id="FontAwesome" horiz-adv-x="1536" >
+  <font-face 
+    font-family="FontAwesome"
+    font-weight="400"
+    font-stretch="normal"
+    units-per-em="1792"
+    panose-1="0 0 0 0 0 0 0 0 0 0"
+    ascent="1536"
+    descent="-256"
+    bbox="-1.02083 -256.962 2304.6 1537.02"
+    underline-thickness="0"
+    underline-position="0"
+    unicode-range="U+0020-F500"
+  />
+<missing-glyph horiz-adv-x="896" 
+d="M224 112h448v1312h-448v-1312zM112 0v1536h672v-1536h-672z" />
+    <glyph glyph-name=".notdef" horiz-adv-x="896" 
+d="M224 112h448v1312h-448v-1312zM112 0v1536h672v-1536h-672z" />
+    <glyph glyph-name=".null" horiz-adv-x="0" 
+ />
+    <glyph glyph-name="nonmarkingreturn" horiz-adv-x="597" 
+ />
+    <glyph glyph-name="space" unicode=" " horiz-adv-x="448" 
+ />
+    <glyph glyph-name="dieresis" unicode="&#xa8;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="copyright" unicode="&#xa9;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="registered" unicode="&#xae;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="acute" unicode="&#xb4;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="AE" unicode="&#xc6;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="Oslash" unicode="&#xd8;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="trademark" unicode="&#x2122;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="infinity" unicode="&#x221e;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="notequal" unicode="&#x2260;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="glass" unicode="&#xf000;" horiz-adv-x="1792" 
+d="M1699 1350q0 -35 -43 -78l-632 -632v-768h320q26 0 45 -19t19 -45t-19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45t45 19h320v768l-632 632q-43 43 -43 78q0 23 18 36.5t38 17.5t43 4h1408q23 0 43 -4t38 -17.5t18 -36.5z" />
+    <glyph glyph-name="music" unicode="&#xf001;" 
+d="M1536 1312v-1120q0 -50 -34 -89t-86 -60.5t-103.5 -32t-96.5 -10.5t-96.5 10.5t-103.5 32t-86 60.5t-34 89t34 89t86 60.5t103.5 32t96.5 10.5q105 0 192 -39v537l-768 -237v-709q0 -50 -34 -89t-86 -60.5t-103.5 -32t-96.5 -10.5t-96.5 10.5t-103.5 32t-86 60.5t-34 89
+t34 89t86 60.5t103.5 32t96.5 10.5q105 0 192 -39v967q0 31 19 56.5t49 35.5l832 256q12 4 28 4q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="search" unicode="&#xf002;" horiz-adv-x="1664" 
+d="M1152 704q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5zM1664 -128q0 -52 -38 -90t-90 -38q-54 0 -90 38l-343 342q-179 -124 -399 -124q-143 0 -273.5 55.5t-225 150t-150 225t-55.5 273.5
+t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -220 -124 -399l343 -343q37 -37 37 -90z" />
+    <glyph glyph-name="envelope" unicode="&#xf003;" horiz-adv-x="1792" 
+d="M1664 32v768q-32 -36 -69 -66q-268 -206 -426 -338q-51 -43 -83 -67t-86.5 -48.5t-102.5 -24.5h-1h-1q-48 0 -102.5 24.5t-86.5 48.5t-83 67q-158 132 -426 338q-37 30 -69 66v-768q0 -13 9.5 -22.5t22.5 -9.5h1472q13 0 22.5 9.5t9.5 22.5zM1664 1083v11v13.5t-0.5 13
+t-3 12.5t-5.5 9t-9 7.5t-14 2.5h-1472q-13 0 -22.5 -9.5t-9.5 -22.5q0 -168 147 -284q193 -152 401 -317q6 -5 35 -29.5t46 -37.5t44.5 -31.5t50.5 -27.5t43 -9h1h1q20 0 43 9t50.5 27.5t44.5 31.5t46 37.5t35 29.5q208 165 401 317q54 43 100.5 115.5t46.5 131.5z
+M1792 1120v-1088q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1472q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="heart" unicode="&#xf004;" horiz-adv-x="1792" 
+d="M896 -128q-26 0 -44 18l-624 602q-10 8 -27.5 26t-55.5 65.5t-68 97.5t-53.5 121t-23.5 138q0 220 127 344t351 124q62 0 126.5 -21.5t120 -58t95.5 -68.5t76 -68q36 36 76 68t95.5 68.5t120 58t126.5 21.5q224 0 351 -124t127 -344q0 -221 -229 -450l-623 -600
+q-18 -18 -44 -18z" />
+    <glyph glyph-name="star" unicode="&#xf005;" horiz-adv-x="1664" 
+d="M1664 889q0 -22 -26 -48l-363 -354l86 -500q1 -7 1 -20q0 -21 -10.5 -35.5t-30.5 -14.5q-19 0 -40 12l-449 236l-449 -236q-22 -12 -40 -12q-21 0 -31.5 14.5t-10.5 35.5q0 6 2 20l86 500l-364 354q-25 27 -25 48q0 37 56 46l502 73l225 455q19 41 49 41t49 -41l225 -455
+l502 -73q56 -9 56 -46z" />
+    <glyph glyph-name="star_empty" unicode="&#xf006;" horiz-adv-x="1664" 
+d="M1137 532l306 297l-422 62l-189 382l-189 -382l-422 -62l306 -297l-73 -421l378 199l377 -199zM1664 889q0 -22 -26 -48l-363 -354l86 -500q1 -7 1 -20q0 -50 -41 -50q-19 0 -40 12l-449 236l-449 -236q-22 -12 -40 -12q-21 0 -31.5 14.5t-10.5 35.5q0 6 2 20l86 500
+l-364 354q-25 27 -25 48q0 37 56 46l502 73l225 455q19 41 49 41t49 -41l225 -455l502 -73q56 -9 56 -46z" />
+    <glyph glyph-name="user" unicode="&#xf007;" horiz-adv-x="1280" 
+d="M1280 137q0 -109 -62.5 -187t-150.5 -78h-854q-88 0 -150.5 78t-62.5 187q0 85 8.5 160.5t31.5 152t58.5 131t94 89t134.5 34.5q131 -128 313 -128t313 128q76 0 134.5 -34.5t94 -89t58.5 -131t31.5 -152t8.5 -160.5zM1024 1024q0 -159 -112.5 -271.5t-271.5 -112.5
+t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5z" />
+    <glyph glyph-name="film" unicode="&#xf008;" horiz-adv-x="1920" 
+d="M384 -64v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM384 320v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM384 704v128q0 26 -19 45t-45 19h-128
+q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1408 -64v512q0 26 -19 45t-45 19h-768q-26 0 -45 -19t-19 -45v-512q0 -26 19 -45t45 -19h768q26 0 45 19t19 45zM384 1088v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45
+t45 -19h128q26 0 45 19t19 45zM1792 -64v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1408 704v512q0 26 -19 45t-45 19h-768q-26 0 -45 -19t-19 -45v-512q0 -26 19 -45t45 -19h768q26 0 45 19t19 45zM1792 320v128
+q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1792 704v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1792 1088v128q0 26 -19 45t-45 19h-128q-26 0 -45 -19
+t-19 -45v-128q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1920 1248v-1344q0 -66 -47 -113t-113 -47h-1600q-66 0 -113 47t-47 113v1344q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="th_large" unicode="&#xf009;" horiz-adv-x="1664" 
+d="M768 512v-384q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90zM768 1280v-384q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90zM1664 512v-384q0 -52 -38 -90t-90 -38
+h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90zM1664 1280v-384q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h512q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="th" unicode="&#xf00a;" horiz-adv-x="1792" 
+d="M512 288v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM512 800v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1152 288v-192q0 -40 -28 -68t-68 -28h-320
+q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM512 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1152 800v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28
+h320q40 0 68 -28t28 -68zM1792 288v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1152 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 800v-192
+q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="th_list" unicode="&#xf00b;" horiz-adv-x="1792" 
+d="M512 288v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM512 800v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 288v-192q0 -40 -28 -68t-68 -28h-960
+q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h960q40 0 68 -28t28 -68zM512 1312v-192q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h320q40 0 68 -28t28 -68zM1792 800v-192q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v192q0 40 28 68t68 28
+h960q40 0 68 -28t28 -68zM1792 1312v-192q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h960q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="ok" unicode="&#xf00c;" horiz-adv-x="1792" 
+d="M1671 970q0 -40 -28 -68l-724 -724l-136 -136q-28 -28 -68 -28t-68 28l-136 136l-362 362q-28 28 -28 68t28 68l136 136q28 28 68 28t68 -28l294 -295l656 657q28 28 68 28t68 -28l136 -136q28 -28 28 -68z" />
+    <glyph glyph-name="remove" unicode="&#xf00d;" horiz-adv-x="1408" 
+d="M1298 214q0 -40 -28 -68l-136 -136q-28 -28 -68 -28t-68 28l-294 294l-294 -294q-28 -28 -68 -28t-68 28l-136 136q-28 28 -28 68t28 68l294 294l-294 294q-28 28 -28 68t28 68l136 136q28 28 68 28t68 -28l294 -294l294 294q28 28 68 28t68 -28l136 -136q28 -28 28 -68
+t-28 -68l-294 -294l294 -294q28 -28 28 -68z" />
+    <glyph glyph-name="zoom_in" unicode="&#xf00e;" horiz-adv-x="1664" 
+d="M1024 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-224v-224q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v224h-224q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h224v224q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5v-224h224
+q13 0 22.5 -9.5t9.5 -22.5zM1152 704q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5zM1664 -128q0 -53 -37.5 -90.5t-90.5 -37.5q-54 0 -90 38l-343 342q-179 -124 -399 -124q-143 0 -273.5 55.5
+t-225 150t-150 225t-55.5 273.5t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -220 -124 -399l343 -343q37 -37 37 -90z" />
+    <glyph glyph-name="zoom_out" unicode="&#xf010;" horiz-adv-x="1664" 
+d="M1024 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-576q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h576q13 0 22.5 -9.5t9.5 -22.5zM1152 704q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5z
+M1664 -128q0 -53 -37.5 -90.5t-90.5 -37.5q-54 0 -90 38l-343 342q-179 -124 -399 -124q-143 0 -273.5 55.5t-225 150t-150 225t-55.5 273.5t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -220 -124 -399l343 -343q37 -37 37 -90z
+" />
+    <glyph glyph-name="off" unicode="&#xf011;" 
+d="M1536 640q0 -156 -61 -298t-164 -245t-245 -164t-298 -61t-298 61t-245 164t-164 245t-61 298q0 182 80.5 343t226.5 270q43 32 95.5 25t83.5 -50q32 -42 24.5 -94.5t-49.5 -84.5q-98 -74 -151.5 -181t-53.5 -228q0 -104 40.5 -198.5t109.5 -163.5t163.5 -109.5
+t198.5 -40.5t198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5q0 121 -53.5 228t-151.5 181q-42 32 -49.5 84.5t24.5 94.5q31 43 84 50t95 -25q146 -109 226.5 -270t80.5 -343zM896 1408v-640q0 -52 -38 -90t-90 -38t-90 38t-38 90v640q0 52 38 90t90 38t90 -38t38 -90z" />
+    <glyph glyph-name="signal" unicode="&#xf012;" horiz-adv-x="1792" 
+d="M256 96v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM640 224v-320q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v320q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1024 480v-576q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23
+v576q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1408 864v-960q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v960q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1792 1376v-1472q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v1472q0 14 9 23t23 9h192q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="cog" unicode="&#xf013;" 
+d="M1024 640q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1536 749v-222q0 -12 -8 -23t-20 -13l-185 -28q-19 -54 -39 -91q35 -50 107 -138q10 -12 10 -25t-9 -23q-27 -37 -99 -108t-94 -71q-12 0 -26 9l-138 108q-44 -23 -91 -38
+q-16 -136 -29 -186q-7 -28 -36 -28h-222q-14 0 -24.5 8.5t-11.5 21.5l-28 184q-49 16 -90 37l-141 -107q-10 -9 -25 -9q-14 0 -25 11q-126 114 -165 168q-7 10 -7 23q0 12 8 23q15 21 51 66.5t54 70.5q-27 50 -41 99l-183 27q-13 2 -21 12.5t-8 23.5v222q0 12 8 23t19 13
+l186 28q14 46 39 92q-40 57 -107 138q-10 12 -10 24q0 10 9 23q26 36 98.5 107.5t94.5 71.5q13 0 26 -10l138 -107q44 23 91 38q16 136 29 186q7 28 36 28h222q14 0 24.5 -8.5t11.5 -21.5l28 -184q49 -16 90 -37l142 107q9 9 24 9q13 0 25 -10q129 -119 165 -170q7 -8 7 -22
+q0 -12 -8 -23q-15 -21 -51 -66.5t-54 -70.5q26 -50 41 -98l183 -28q13 -2 21 -12.5t8 -23.5z" />
+    <glyph glyph-name="trash" unicode="&#xf014;" horiz-adv-x="1408" 
+d="M512 800v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM768 800v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1024 800v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576
+q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1152 76v948h-896v-948q0 -22 7 -40.5t14.5 -27t10.5 -8.5h832q3 0 10.5 8.5t14.5 27t7 40.5zM480 1152h448l-48 117q-7 9 -17 11h-317q-10 -2 -17 -11zM1408 1120v-64q0 -14 -9 -23t-23 -9h-96v-948q0 -83 -47 -143.5t-113 -60.5h-832
+q-66 0 -113 58.5t-47 141.5v952h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h309l70 167q15 37 54 63t79 26h320q40 0 79 -26t54 -63l70 -167h309q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="home" unicode="&#xf015;" horiz-adv-x="1664" 
+d="M1408 544v-480q0 -26 -19 -45t-45 -19h-384v384h-256v-384h-384q-26 0 -45 19t-19 45v480q0 1 0.5 3t0.5 3l575 474l575 -474q1 -2 1 -6zM1631 613l-62 -74q-8 -9 -21 -11h-3q-13 0 -21 7l-692 577l-692 -577q-12 -8 -24 -7q-13 2 -21 11l-62 74q-8 10 -7 23.5t11 21.5
+l719 599q32 26 76 26t76 -26l244 -204v195q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-408l219 -182q10 -8 11 -21.5t-7 -23.5z" />
+    <glyph glyph-name="file_alt" unicode="&#xf016;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+" />
+    <glyph glyph-name="time" unicode="&#xf017;" 
+d="M896 992v-448q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h224v352q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="road" unicode="&#xf018;" horiz-adv-x="1920" 
+d="M1111 540v4l-24 320q-1 13 -11 22.5t-23 9.5h-186q-13 0 -23 -9.5t-11 -22.5l-24 -320v-4q-1 -12 8 -20t21 -8h244q12 0 21 8t8 20zM1870 73q0 -73 -46 -73h-704q13 0 22 9.5t8 22.5l-20 256q-1 13 -11 22.5t-23 9.5h-272q-13 0 -23 -9.5t-11 -22.5l-20 -256
+q-1 -13 8 -22.5t22 -9.5h-704q-46 0 -46 73q0 54 26 116l417 1044q8 19 26 33t38 14h339q-13 0 -23 -9.5t-11 -22.5l-15 -192q-1 -14 8 -23t22 -9h166q13 0 22 9t8 23l-15 192q-1 13 -11 22.5t-23 9.5h339q20 0 38 -14t26 -33l417 -1044q26 -62 26 -116z" />
+    <glyph glyph-name="download_alt" unicode="&#xf019;" horiz-adv-x="1664" 
+d="M1280 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1536 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 416v-320q0 -40 -28 -68t-68 -28h-1472q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h465l135 -136
+q58 -56 136 -56t136 56l136 136h464q40 0 68 -28t28 -68zM1339 985q17 -41 -14 -70l-448 -448q-18 -19 -45 -19t-45 19l-448 448q-31 29 -14 70q17 39 59 39h256v448q0 26 19 45t45 19h256q26 0 45 -19t19 -45v-448h256q42 0 59 -39z" />
+    <glyph glyph-name="download" unicode="&#xf01a;" 
+d="M1120 608q0 -12 -10 -24l-319 -319q-11 -9 -23 -9t-23 9l-320 320q-15 16 -7 35q8 20 30 20h192v352q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-352h192q14 0 23 -9t9 -23zM768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273
+t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="upload" unicode="&#xf01b;" 
+d="M1118 660q-8 -20 -30 -20h-192v-352q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v352h-192q-14 0 -23 9t-9 23q0 12 10 24l319 319q11 9 23 9t23 -9l320 -320q15 -16 7 -35zM768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198
+t73 273t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="inbox" unicode="&#xf01c;" 
+d="M1023 576h316q-1 3 -2.5 8.5t-2.5 7.5l-212 496h-708l-212 -496q-1 -3 -2.5 -8.5t-2.5 -7.5h316l95 -192h320zM1536 546v-482q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v482q0 62 25 123l238 552q10 25 36.5 42t52.5 17h832q26 0 52.5 -17t36.5 -42l238 -552
+q25 -61 25 -123z" />
+    <glyph glyph-name="play_circle" unicode="&#xf01d;" 
+d="M1184 640q0 -37 -32 -55l-544 -320q-15 -9 -32 -9q-16 0 -32 8q-32 19 -32 56v640q0 37 32 56q33 18 64 -1l544 -320q32 -18 32 -55zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="repeat" unicode="&#xf01e;" 
+d="M1536 1280v-448q0 -26 -19 -45t-45 -19h-448q-42 0 -59 40q-17 39 14 69l138 138q-148 137 -349 137q-104 0 -198.5 -40.5t-163.5 -109.5t-109.5 -163.5t-40.5 -198.5t40.5 -198.5t109.5 -163.5t163.5 -109.5t198.5 -40.5q119 0 225 52t179 147q7 10 23 12q15 0 25 -9
+l137 -138q9 -8 9.5 -20.5t-7.5 -22.5q-109 -132 -264 -204.5t-327 -72.5q-156 0 -298 61t-245 164t-164 245t-61 298t61 298t164 245t245 164t298 61q147 0 284.5 -55.5t244.5 -156.5l130 129q29 31 70 14q39 -17 39 -59z" />
+    <glyph glyph-name="refresh" unicode="&#xf021;" 
+d="M1511 480q0 -5 -1 -7q-64 -268 -268 -434.5t-478 -166.5q-146 0 -282.5 55t-243.5 157l-129 -129q-19 -19 -45 -19t-45 19t-19 45v448q0 26 19 45t45 19h448q26 0 45 -19t19 -45t-19 -45l-137 -137q71 -66 161 -102t187 -36q134 0 250 65t186 179q11 17 53 117
+q8 23 30 23h192q13 0 22.5 -9.5t9.5 -22.5zM1536 1280v-448q0 -26 -19 -45t-45 -19h-448q-26 0 -45 19t-19 45t19 45l138 138q-148 137 -349 137q-134 0 -250 -65t-186 -179q-11 -17 -53 -117q-8 -23 -30 -23h-199q-13 0 -22.5 9.5t-9.5 22.5v7q65 268 270 434.5t480 166.5
+q146 0 284 -55.5t245 -156.5l130 129q19 19 45 19t45 -19t19 -45z" />
+    <glyph glyph-name="list_alt" unicode="&#xf022;" horiz-adv-x="1792" 
+d="M384 352v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 608v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M384 864v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1536 352v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h960q13 0 22.5 -9.5t9.5 -22.5z
+M1536 608v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h960q13 0 22.5 -9.5t9.5 -22.5zM1536 864v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h960q13 0 22.5 -9.5
+t9.5 -22.5zM1664 160v832q0 13 -9.5 22.5t-22.5 9.5h-1472q-13 0 -22.5 -9.5t-9.5 -22.5v-832q0 -13 9.5 -22.5t22.5 -9.5h1472q13 0 22.5 9.5t9.5 22.5zM1792 1248v-1088q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1472q66 0 113 -47
+t47 -113z" />
+    <glyph glyph-name="lock" unicode="&#xf023;" horiz-adv-x="1152" 
+d="M320 768h512v192q0 106 -75 181t-181 75t-181 -75t-75 -181v-192zM1152 672v-576q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v576q0 40 28 68t68 28h32v192q0 184 132 316t316 132t316 -132t132 -316v-192h32q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="flag" unicode="&#xf024;" horiz-adv-x="1792" 
+d="M320 1280q0 -72 -64 -110v-1266q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v1266q-64 38 -64 110q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1792 1216v-763q0 -25 -12.5 -38.5t-39.5 -27.5q-215 -116 -369 -116q-61 0 -123.5 22t-108.5 48
+t-115.5 48t-142.5 22q-192 0 -464 -146q-17 -9 -33 -9q-26 0 -45 19t-19 45v742q0 32 31 55q21 14 79 43q236 120 421 120q107 0 200 -29t219 -88q38 -19 88 -19q54 0 117.5 21t110 47t88 47t54.5 21q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="headphones" unicode="&#xf025;" horiz-adv-x="1664" 
+d="M1664 650q0 -166 -60 -314l-20 -49l-185 -33q-22 -83 -90.5 -136.5t-156.5 -53.5v-32q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-32q71 0 130 -35.5t93 -95.5l68 12q29 95 29 193q0 148 -88 279t-236.5 209t-315.5 78
+t-315.5 -78t-236.5 -209t-88 -279q0 -98 29 -193l68 -12q34 60 93 95.5t130 35.5v32q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-576q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v32q-88 0 -156.5 53.5t-90.5 136.5l-185 33l-20 49q-60 148 -60 314q0 151 67 291t179 242.5
+t266 163.5t320 61t320 -61t266 -163.5t179 -242.5t67 -291z" />
+    <glyph glyph-name="volume_off" unicode="&#xf026;" horiz-adv-x="768" 
+d="M768 1184v-1088q0 -26 -19 -45t-45 -19t-45 19l-333 333h-262q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h262l333 333q19 19 45 19t45 -19t19 -45z" />
+    <glyph glyph-name="volume_down" unicode="&#xf027;" horiz-adv-x="1152" 
+d="M768 1184v-1088q0 -26 -19 -45t-45 -19t-45 19l-333 333h-262q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h262l333 333q19 19 45 19t45 -19t19 -45zM1152 640q0 -76 -42.5 -141.5t-112.5 -93.5q-10 -5 -25 -5q-26 0 -45 18.5t-19 45.5q0 21 12 35.5t29 25t34 23t29 36
+t12 56.5t-12 56.5t-29 36t-34 23t-29 25t-12 35.5q0 27 19 45.5t45 18.5q15 0 25 -5q70 -27 112.5 -93t42.5 -142z" />
+    <glyph glyph-name="volume_up" unicode="&#xf028;" horiz-adv-x="1664" 
+d="M768 1184v-1088q0 -26 -19 -45t-45 -19t-45 19l-333 333h-262q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h262l333 333q19 19 45 19t45 -19t19 -45zM1152 640q0 -76 -42.5 -141.5t-112.5 -93.5q-10 -5 -25 -5q-26 0 -45 18.5t-19 45.5q0 21 12 35.5t29 25t34 23t29 36
+t12 56.5t-12 56.5t-29 36t-34 23t-29 25t-12 35.5q0 27 19 45.5t45 18.5q15 0 25 -5q70 -27 112.5 -93t42.5 -142zM1408 640q0 -153 -85 -282.5t-225 -188.5q-13 -5 -25 -5q-27 0 -46 19t-19 45q0 39 39 59q56 29 76 44q74 54 115.5 135.5t41.5 173.5t-41.5 173.5
+t-115.5 135.5q-20 15 -76 44q-39 20 -39 59q0 26 19 45t45 19q13 0 26 -5q140 -59 225 -188.5t85 -282.5zM1664 640q0 -230 -127 -422.5t-338 -283.5q-13 -5 -26 -5q-26 0 -45 19t-19 45q0 36 39 59q7 4 22.5 10.5t22.5 10.5q46 25 82 51q123 91 192 227t69 289t-69 289
+t-192 227q-36 26 -82 51q-7 4 -22.5 10.5t-22.5 10.5q-39 23 -39 59q0 26 19 45t45 19q13 0 26 -5q211 -91 338 -283.5t127 -422.5z" />
+    <glyph glyph-name="qrcode" unicode="&#xf029;" horiz-adv-x="1408" 
+d="M384 384v-128h-128v128h128zM384 1152v-128h-128v128h128zM1152 1152v-128h-128v128h128zM128 129h384v383h-384v-383zM128 896h384v384h-384v-384zM896 896h384v384h-384v-384zM640 640v-640h-640v640h640zM1152 128v-128h-128v128h128zM1408 128v-128h-128v128h128z
+M1408 640v-384h-384v128h-128v-384h-128v640h384v-128h128v128h128zM640 1408v-640h-640v640h640zM1408 1408v-640h-640v640h640z" />
+    <glyph glyph-name="barcode" unicode="&#xf02a;" horiz-adv-x="1792" 
+d="M63 0h-63v1408h63v-1408zM126 1h-32v1407h32v-1407zM220 1h-31v1407h31v-1407zM377 1h-31v1407h31v-1407zM534 1h-62v1407h62v-1407zM660 1h-31v1407h31v-1407zM723 1h-31v1407h31v-1407zM786 1h-31v1407h31v-1407zM943 1h-63v1407h63v-1407zM1100 1h-63v1407h63v-1407z
+M1226 1h-63v1407h63v-1407zM1352 1h-63v1407h63v-1407zM1446 1h-63v1407h63v-1407zM1635 1h-94v1407h94v-1407zM1698 1h-32v1407h32v-1407zM1792 0h-63v1408h63v-1408z" />
+    <glyph glyph-name="tag" unicode="&#xf02b;" 
+d="M448 1088q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1515 512q0 -53 -37 -90l-491 -492q-39 -37 -91 -37q-53 0 -90 37l-715 716q-38 37 -64.5 101t-26.5 117v416q0 52 38 90t90 38h416q53 0 117 -26.5t102 -64.5
+l715 -714q37 -39 37 -91z" />
+    <glyph glyph-name="tags" unicode="&#xf02c;" horiz-adv-x="1920" 
+d="M448 1088q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1515 512q0 -53 -37 -90l-491 -492q-39 -37 -91 -37q-53 0 -90 37l-715 716q-38 37 -64.5 101t-26.5 117v416q0 52 38 90t90 38h416q53 0 117 -26.5t102 -64.5
+l715 -714q37 -39 37 -91zM1899 512q0 -53 -37 -90l-491 -492q-39 -37 -91 -37q-36 0 -59 14t-53 45l470 470q37 37 37 90q0 52 -37 91l-715 714q-38 38 -102 64.5t-117 26.5h224q53 0 117 -26.5t102 -64.5l715 -714q37 -39 37 -91z" />
+    <glyph glyph-name="book" unicode="&#xf02d;" horiz-adv-x="1664" 
+d="M1639 1058q40 -57 18 -129l-275 -906q-19 -64 -76.5 -107.5t-122.5 -43.5h-923q-77 0 -148.5 53.5t-99.5 131.5q-24 67 -2 127q0 4 3 27t4 37q1 8 -3 21.5t-3 19.5q2 11 8 21t16.5 23.5t16.5 23.5q23 38 45 91.5t30 91.5q3 10 0.5 30t-0.5 28q3 11 17 28t17 23
+q21 36 42 92t25 90q1 9 -2.5 32t0.5 28q4 13 22 30.5t22 22.5q19 26 42.5 84.5t27.5 96.5q1 8 -3 25.5t-2 26.5q2 8 9 18t18 23t17 21q8 12 16.5 30.5t15 35t16 36t19.5 32t26.5 23.5t36 11.5t47.5 -5.5l-1 -3q38 9 51 9h761q74 0 114 -56t18 -130l-274 -906
+q-36 -119 -71.5 -153.5t-128.5 -34.5h-869q-27 0 -38 -15q-11 -16 -1 -43q24 -70 144 -70h923q29 0 56 15.5t35 41.5l300 987q7 22 5 57q38 -15 59 -43zM575 1056q-4 -13 2 -22.5t20 -9.5h608q13 0 25.5 9.5t16.5 22.5l21 64q4 13 -2 22.5t-20 9.5h-608q-13 0 -25.5 -9.5
+t-16.5 -22.5zM492 800q-4 -13 2 -22.5t20 -9.5h608q13 0 25.5 9.5t16.5 22.5l21 64q4 13 -2 22.5t-20 9.5h-608q-13 0 -25.5 -9.5t-16.5 -22.5z" />
+    <glyph glyph-name="bookmark" unicode="&#xf02e;" horiz-adv-x="1280" 
+d="M1164 1408q23 0 44 -9q33 -13 52.5 -41t19.5 -62v-1289q0 -34 -19.5 -62t-52.5 -41q-19 -8 -44 -8q-48 0 -83 32l-441 424l-441 -424q-36 -33 -83 -33q-23 0 -44 9q-33 13 -52.5 41t-19.5 62v1289q0 34 19.5 62t52.5 41q21 9 44 9h1048z" />
+    <glyph glyph-name="print" unicode="&#xf02f;" horiz-adv-x="1664" 
+d="M384 0h896v256h-896v-256zM384 640h896v384h-160q-40 0 -68 28t-28 68v160h-640v-640zM1536 576q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 576v-416q0 -13 -9.5 -22.5t-22.5 -9.5h-224v-160q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68
+v160h-224q-13 0 -22.5 9.5t-9.5 22.5v416q0 79 56.5 135.5t135.5 56.5h64v544q0 40 28 68t68 28h672q40 0 88 -20t76 -48l152 -152q28 -28 48 -76t20 -88v-256h64q79 0 135.5 -56.5t56.5 -135.5z" />
+    <glyph glyph-name="camera" unicode="&#xf030;" horiz-adv-x="1920" 
+d="M960 864q119 0 203.5 -84.5t84.5 -203.5t-84.5 -203.5t-203.5 -84.5t-203.5 84.5t-84.5 203.5t84.5 203.5t203.5 84.5zM1664 1280q106 0 181 -75t75 -181v-896q0 -106 -75 -181t-181 -75h-1408q-106 0 -181 75t-75 181v896q0 106 75 181t181 75h224l51 136
+q19 49 69.5 84.5t103.5 35.5h512q53 0 103.5 -35.5t69.5 -84.5l51 -136h224zM960 128q185 0 316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="font" unicode="&#xf031;" horiz-adv-x="1664" 
+d="M725 977l-170 -450q33 0 136.5 -2t160.5 -2q19 0 57 2q-87 253 -184 452zM0 -128l2 79q23 7 56 12.5t57 10.5t49.5 14.5t44.5 29t31 50.5l237 616l280 724h75h53q8 -14 11 -21l205 -480q33 -78 106 -257.5t114 -274.5q15 -34 58 -144.5t72 -168.5q20 -45 35 -57
+q19 -15 88 -29.5t84 -20.5q6 -38 6 -57q0 -5 -0.5 -13.5t-0.5 -12.5q-63 0 -190 8t-191 8q-76 0 -215 -7t-178 -8q0 43 4 78l131 28q1 0 12.5 2.5t15.5 3.5t14.5 4.5t15 6.5t11 8t9 11t2.5 14q0 16 -31 96.5t-72 177.5t-42 100l-450 2q-26 -58 -76.5 -195.5t-50.5 -162.5
+q0 -22 14 -37.5t43.5 -24.5t48.5 -13.5t57 -8.5t41 -4q1 -19 1 -58q0 -9 -2 -27q-58 0 -174.5 10t-174.5 10q-8 0 -26.5 -4t-21.5 -4q-80 -14 -188 -14z" />
+    <glyph glyph-name="bold" unicode="&#xf032;" horiz-adv-x="1408" 
+d="M555 15q74 -32 140 -32q376 0 376 335q0 114 -41 180q-27 44 -61.5 74t-67.5 46.5t-80.5 25t-84 10.5t-94.5 2q-73 0 -101 -10q0 -53 -0.5 -159t-0.5 -158q0 -8 -1 -67.5t-0.5 -96.5t4.5 -83.5t12 -66.5zM541 761q42 -7 109 -7q82 0 143 13t110 44.5t74.5 89.5t25.5 142
+q0 70 -29 122.5t-79 82t-108 43.5t-124 14q-50 0 -130 -13q0 -50 4 -151t4 -152q0 -27 -0.5 -80t-0.5 -79q0 -46 1 -69zM0 -128l2 94q15 4 85 16t106 27q7 12 12.5 27t8.5 33.5t5.5 32.5t3 37.5t0.5 34v35.5v30q0 982 -22 1025q-4 8 -22 14.5t-44.5 11t-49.5 7t-48.5 4.5
+t-30.5 3l-4 83q98 2 340 11.5t373 9.5q23 0 68 -0.5t68 -0.5q70 0 136.5 -13t128.5 -42t108 -71t74 -104.5t28 -137.5q0 -52 -16.5 -95.5t-39 -72t-64.5 -57.5t-73 -45t-84 -40q154 -35 256.5 -134t102.5 -248q0 -100 -35 -179.5t-93.5 -130.5t-138 -85.5t-163.5 -48.5
+t-176 -14q-44 0 -132 3t-132 3q-106 0 -307 -11t-231 -12z" />
+    <glyph glyph-name="italic" unicode="&#xf033;" horiz-adv-x="1024" 
+d="M0 -126l17 85q22 7 61.5 16.5t72 19t59.5 23.5q28 35 41 101q1 7 62 289t114 543.5t52 296.5v25q-24 13 -54.5 18.5t-69.5 8t-58 5.5l19 103q33 -2 120 -6.5t149.5 -7t120.5 -2.5q48 0 98.5 2.5t121 7t98.5 6.5q-5 -39 -19 -89q-30 -10 -101.5 -28.5t-108.5 -33.5
+q-8 -19 -14 -42.5t-9 -40t-7.5 -45.5t-6.5 -42q-27 -148 -87.5 -419.5t-77.5 -355.5q-2 -9 -13 -58t-20 -90t-16 -83.5t-6 -57.5l1 -18q17 -4 185 -31q-3 -44 -16 -99q-11 0 -32.5 -1.5t-32.5 -1.5q-29 0 -87 10t-86 10q-138 2 -206 2q-51 0 -143 -9t-121 -11z" />
+    <glyph glyph-name="text_height" unicode="&#xf034;" horiz-adv-x="1792" 
+d="M1744 128q33 0 42 -18.5t-11 -44.5l-126 -162q-20 -26 -49 -26t-49 26l-126 162q-20 26 -11 44.5t42 18.5h80v1024h-80q-33 0 -42 18.5t11 44.5l126 162q20 26 49 26t49 -26l126 -162q20 -26 11 -44.5t-42 -18.5h-80v-1024h80zM81 1407l54 -27q12 -5 211 -5q44 0 132 2
+t132 2q36 0 107.5 -0.5t107.5 -0.5h293q6 0 21 -0.5t20.5 0t16 3t17.5 9t15 17.5l42 1q4 0 14 -0.5t14 -0.5q2 -112 2 -336q0 -80 -5 -109q-39 -14 -68 -18q-25 44 -54 128q-3 9 -11 48t-14.5 73.5t-7.5 35.5q-6 8 -12 12.5t-15.5 6t-13 2.5t-18 0.5t-16.5 -0.5
+q-17 0 -66.5 0.5t-74.5 0.5t-64 -2t-71 -6q-9 -81 -8 -136q0 -94 2 -388t2 -455q0 -16 -2.5 -71.5t0 -91.5t12.5 -69q40 -21 124 -42.5t120 -37.5q5 -40 5 -50q0 -14 -3 -29l-34 -1q-76 -2 -218 8t-207 10q-50 0 -151 -9t-152 -9q-3 51 -3 52v9q17 27 61.5 43t98.5 29t78 27
+q19 42 19 383q0 101 -3 303t-3 303v117q0 2 0.5 15.5t0.5 25t-1 25.5t-3 24t-5 14q-11 12 -162 12q-33 0 -93 -12t-80 -26q-19 -13 -34 -72.5t-31.5 -111t-42.5 -53.5q-42 26 -56 44v383z" />
+    <glyph glyph-name="text_width" unicode="&#xf035;" 
+d="M81 1407l54 -27q12 -5 211 -5q44 0 132 2t132 2q70 0 246.5 1t304.5 0.5t247 -4.5q33 -1 56 31l42 1q4 0 14 -0.5t14 -0.5q2 -112 2 -336q0 -80 -5 -109q-39 -14 -68 -18q-25 44 -54 128q-3 9 -11 47.5t-15 73.5t-7 36q-10 13 -27 19q-5 2 -66 2q-30 0 -93 1t-103 1
+t-94 -2t-96 -7q-9 -81 -8 -136l1 -152v52q0 -55 1 -154t1.5 -180t0.5 -153q0 -16 -2.5 -71.5t0 -91.5t12.5 -69q40 -21 124 -42.5t120 -37.5q5 -40 5 -50q0 -14 -3 -29l-34 -1q-76 -2 -218 8t-207 10q-50 0 -151 -9t-152 -9q-3 51 -3 52v9q17 27 61.5 43t98.5 29t78 27
+q7 16 11.5 74t6 145.5t1.5 155t-0.5 153.5t-0.5 89q0 7 -2.5 21.5t-2.5 22.5q0 7 0.5 44t1 73t0 76.5t-3 67.5t-6.5 32q-11 12 -162 12q-41 0 -163 -13.5t-138 -24.5q-19 -12 -34 -71.5t-31.5 -111.5t-42.5 -54q-42 26 -56 44v383zM1310 125q12 0 42 -19.5t57.5 -41.5
+t59.5 -49t36 -30q26 -21 26 -49t-26 -49q-4 -3 -36 -30t-59.5 -49t-57.5 -41.5t-42 -19.5q-13 0 -20.5 10.5t-10 28.5t-2.5 33.5t1.5 33t1.5 19.5h-1024q0 -2 1.5 -19.5t1.5 -33t-2.5 -33.5t-10 -28.5t-20.5 -10.5q-12 0 -42 19.5t-57.5 41.5t-59.5 49t-36 30q-26 21 -26 49
+t26 49q4 3 36 30t59.5 49t57.5 41.5t42 19.5q13 0 20.5 -10.5t10 -28.5t2.5 -33.5t-1.5 -33t-1.5 -19.5h1024q0 2 -1.5 19.5t-1.5 33t2.5 33.5t10 28.5t20.5 10.5z" />
+    <glyph glyph-name="align_left" unicode="&#xf036;" horiz-adv-x="1792" 
+d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1408 576v-128q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1280q26 0 45 -19t19 -45zM1664 960v-128q0 -26 -19 -45
+t-45 -19h-1536q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1536q26 0 45 -19t19 -45zM1280 1344v-128q0 -26 -19 -45t-45 -19h-1152q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="align_center" unicode="&#xf037;" horiz-adv-x="1792" 
+d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1408 576v-128q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h896q26 0 45 -19t19 -45zM1664 960v-128q0 -26 -19 -45t-45 -19
+h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1280 1344v-128q0 -26 -19 -45t-45 -19h-640q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h640q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="align_right" unicode="&#xf038;" horiz-adv-x="1792" 
+d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 576v-128q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1280q26 0 45 -19t19 -45zM1792 960v-128q0 -26 -19 -45
+t-45 -19h-1536q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1536q26 0 45 -19t19 -45zM1792 1344v-128q0 -26 -19 -45t-45 -19h-1152q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="align_justify" unicode="&#xf039;" horiz-adv-x="1792" 
+d="M1792 192v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 576v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 960v-128q0 -26 -19 -45
+t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 1344v-128q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1664q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="list" unicode="&#xf03a;" horiz-adv-x="1792" 
+d="M256 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5zM256 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5
+t9.5 -22.5zM256 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1344
+q13 0 22.5 -9.5t9.5 -22.5zM256 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5zM1792 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5
+t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5zM1792 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5zM1792 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v192
+q0 13 9.5 22.5t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5z" />
+    <glyph glyph-name="indent_left" unicode="&#xf03b;" horiz-adv-x="1792" 
+d="M384 992v-576q0 -13 -9.5 -22.5t-22.5 -9.5q-14 0 -23 9l-288 288q-9 9 -9 23t9 23l288 288q9 9 23 9q13 0 22.5 -9.5t9.5 -22.5zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5
+t9.5 -22.5zM1792 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088q13 0 22.5 -9.5t9.5 -22.5zM1792 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088
+q13 0 22.5 -9.5t9.5 -22.5zM1792 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5t9.5 -22.5z" />
+    <glyph glyph-name="indent_right" unicode="&#xf03c;" horiz-adv-x="1792" 
+d="M352 704q0 -14 -9 -23l-288 -288q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5v576q0 13 9.5 22.5t22.5 9.5q14 0 23 -9l288 -288q9 -9 9 -23zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5
+t9.5 -22.5zM1792 608v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088q13 0 22.5 -9.5t9.5 -22.5zM1792 992v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1088q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1088
+q13 0 22.5 -9.5t9.5 -22.5zM1792 1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1728q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1728q13 0 22.5 -9.5t9.5 -22.5z" />
+    <glyph glyph-name="facetime_video" unicode="&#xf03d;" horiz-adv-x="1792" 
+d="M1792 1184v-1088q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-403 403v-166q0 -119 -84.5 -203.5t-203.5 -84.5h-704q-119 0 -203.5 84.5t-84.5 203.5v704q0 119 84.5 203.5t203.5 84.5h704q119 0 203.5 -84.5t84.5 -203.5v-165l403 402q18 19 45 19q12 0 25 -5
+q39 -17 39 -59z" />
+    <glyph glyph-name="picture" unicode="&#xf03e;" horiz-adv-x="1920" 
+d="M640 960q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1664 576v-448h-1408v192l320 320l160 -160l512 512zM1760 1280h-1600q-13 0 -22.5 -9.5t-9.5 -22.5v-1216q0 -13 9.5 -22.5t22.5 -9.5h1600q13 0 22.5 9.5t9.5 22.5v1216
+q0 13 -9.5 22.5t-22.5 9.5zM1920 1248v-1216q0 -66 -47 -113t-113 -47h-1600q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="pencil" unicode="&#xf040;" 
+d="M363 0l91 91l-235 235l-91 -91v-107h128v-128h107zM886 928q0 22 -22 22q-10 0 -17 -7l-542 -542q-7 -7 -7 -17q0 -22 22 -22q10 0 17 7l542 542q7 7 7 17zM832 1120l416 -416l-832 -832h-416v416zM1515 1024q0 -53 -37 -90l-166 -166l-416 416l166 165q36 38 90 38
+q53 0 91 -38l235 -234q37 -39 37 -91z" />
+    <glyph glyph-name="map_marker" unicode="&#xf041;" horiz-adv-x="1024" 
+d="M768 896q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1024 896q0 -109 -33 -179l-364 -774q-16 -33 -47.5 -52t-67.5 -19t-67.5 19t-46.5 52l-365 774q-33 70 -33 179q0 212 150 362t362 150t362 -150t150 -362z" />
+    <glyph glyph-name="adjust" unicode="&#xf042;" 
+d="M768 96v1088q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="tint" unicode="&#xf043;" horiz-adv-x="1024" 
+d="M512 384q0 36 -20 69q-1 1 -15.5 22.5t-25.5 38t-25 44t-21 50.5q-4 16 -21 16t-21 -16q-7 -23 -21 -50.5t-25 -44t-25.5 -38t-15.5 -22.5q-20 -33 -20 -69q0 -53 37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1024 512q0 -212 -150 -362t-362 -150t-362 150t-150 362
+q0 145 81 275q6 9 62.5 90.5t101 151t99.5 178t83 201.5q9 30 34 47t51 17t51.5 -17t33.5 -47q28 -93 83 -201.5t99.5 -178t101 -151t62.5 -90.5q81 -127 81 -275z" />
+    <glyph glyph-name="edit" unicode="&#xf044;" horiz-adv-x="1792" 
+d="M888 352l116 116l-152 152l-116 -116v-56h96v-96h56zM1328 1072q-16 16 -33 -1l-350 -350q-17 -17 -1 -33t33 1l350 350q17 17 1 33zM1408 478v-190q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832
+q63 0 117 -25q15 -7 18 -23q3 -17 -9 -29l-49 -49q-14 -14 -32 -8q-23 6 -45 6h-832q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v126q0 13 9 22l64 64q15 15 35 7t20 -29zM1312 1216l288 -288l-672 -672h-288v288zM1756 1084l-92 -92
+l-288 288l92 92q28 28 68 28t68 -28l152 -152q28 -28 28 -68t-28 -68z" />
+    <glyph glyph-name="share" unicode="&#xf045;" horiz-adv-x="1664" 
+d="M1408 547v-259q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h255v0q13 0 22.5 -9.5t9.5 -22.5q0 -27 -26 -32q-77 -26 -133 -60q-10 -4 -16 -4h-112q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832
+q66 0 113 47t47 113v214q0 19 18 29q28 13 54 37q16 16 35 8q21 -9 21 -29zM1645 1043l-384 -384q-18 -19 -45 -19q-12 0 -25 5q-39 17 -39 59v192h-160q-323 0 -438 -131q-119 -137 -74 -473q3 -23 -20 -34q-8 -2 -12 -2q-16 0 -26 13q-10 14 -21 31t-39.5 68.5t-49.5 99.5
+t-38.5 114t-17.5 122q0 49 3.5 91t14 90t28 88t47 81.5t68.5 74t94.5 61.5t124.5 48.5t159.5 30.5t196.5 11h160v192q0 42 39 59q13 5 25 5q26 0 45 -19l384 -384q19 -19 19 -45t-19 -45z" />
+    <glyph glyph-name="check" unicode="&#xf046;" horiz-adv-x="1664" 
+d="M1408 606v-318q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832q63 0 117 -25q15 -7 18 -23q3 -17 -9 -29l-49 -49q-10 -10 -23 -10q-3 0 -9 2q-23 6 -45 6h-832q-66 0 -113 -47t-47 -113v-832
+q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v254q0 13 9 22l64 64q10 10 23 10q6 0 12 -3q20 -8 20 -29zM1639 1095l-814 -814q-24 -24 -57 -24t-57 24l-430 430q-24 24 -24 57t24 57l110 110q24 24 57 24t57 -24l263 -263l647 647q24 24 57 24t57 -24l110 -110
+q24 -24 24 -57t-24 -57z" />
+    <glyph glyph-name="move" unicode="&#xf047;" horiz-adv-x="1792" 
+d="M1792 640q0 -26 -19 -45l-256 -256q-19 -19 -45 -19t-45 19t-19 45v128h-384v-384h128q26 0 45 -19t19 -45t-19 -45l-256 -256q-19 -19 -45 -19t-45 19l-256 256q-19 19 -19 45t19 45t45 19h128v384h-384v-128q0 -26 -19 -45t-45 -19t-45 19l-256 256q-19 19 -19 45
+t19 45l256 256q19 19 45 19t45 -19t19 -45v-128h384v384h-128q-26 0 -45 19t-19 45t19 45l256 256q19 19 45 19t45 -19l256 -256q19 -19 19 -45t-19 -45t-45 -19h-128v-384h384v128q0 26 19 45t45 19t45 -19l256 -256q19 -19 19 -45z" />
+    <glyph glyph-name="step_backward" unicode="&#xf048;" horiz-adv-x="1024" 
+d="M979 1395q19 19 32 13t13 -32v-1472q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-678q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-678q4 10 13 19z" />
+    <glyph glyph-name="fast_backward" unicode="&#xf049;" horiz-adv-x="1792" 
+d="M1747 1395q19 19 32 13t13 -32v-1472q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-710q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-678q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-678q4 10 13 19l710 710
+q19 19 32 13t13 -32v-710q4 10 13 19z" />
+    <glyph glyph-name="backward" unicode="&#xf04a;" horiz-adv-x="1664" 
+d="M1619 1395q19 19 32 13t13 -32v-1472q0 -26 -13 -32t-32 13l-710 710q-9 9 -13 19v-710q0 -26 -13 -32t-32 13l-710 710q-19 19 -19 45t19 45l710 710q19 19 32 13t13 -32v-710q4 10 13 19z" />
+    <glyph glyph-name="play" unicode="&#xf04b;" horiz-adv-x="1408" 
+d="M1384 609l-1328 -738q-23 -13 -39.5 -3t-16.5 36v1472q0 26 16.5 36t39.5 -3l1328 -738q23 -13 23 -31t-23 -31z" />
+    <glyph glyph-name="pause" unicode="&#xf04c;" 
+d="M1536 1344v-1408q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h512q26 0 45 -19t19 -45zM640 1344v-1408q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h512q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="stop" unicode="&#xf04d;" 
+d="M1536 1344v-1408q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h1408q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="forward" unicode="&#xf04e;" horiz-adv-x="1664" 
+d="M45 -115q-19 -19 -32 -13t-13 32v1472q0 26 13 32t32 -13l710 -710q9 -9 13 -19v710q0 26 13 32t32 -13l710 -710q19 -19 19 -45t-19 -45l-710 -710q-19 -19 -32 -13t-13 32v710q-4 -10 -13 -19z" />
+    <glyph glyph-name="fast_forward" unicode="&#xf050;" horiz-adv-x="1792" 
+d="M45 -115q-19 -19 -32 -13t-13 32v1472q0 26 13 32t32 -13l710 -710q9 -9 13 -19v710q0 26 13 32t32 -13l710 -710q9 -9 13 -19v678q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-1408q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v678q-4 -10 -13 -19l-710 -710
+q-19 -19 -32 -13t-13 32v710q-4 -10 -13 -19z" />
+    <glyph glyph-name="step_forward" unicode="&#xf051;" horiz-adv-x="1024" 
+d="M45 -115q-19 -19 -32 -13t-13 32v1472q0 26 13 32t32 -13l710 -710q9 -9 13 -19v678q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-1408q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v678q-4 -10 -13 -19z" />
+    <glyph glyph-name="eject" unicode="&#xf052;" horiz-adv-x="1538" 
+d="M14 557l710 710q19 19 45 19t45 -19l710 -710q19 -19 13 -32t-32 -13h-1472q-26 0 -32 13t13 32zM1473 0h-1408q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1408q26 0 45 -19t19 -45v-256q0 -26 -19 -45t-45 -19z" />
+    <glyph glyph-name="chevron_left" unicode="&#xf053;" horiz-adv-x="1280" 
+d="M1171 1235l-531 -531l531 -531q19 -19 19 -45t-19 -45l-166 -166q-19 -19 -45 -19t-45 19l-742 742q-19 19 -19 45t19 45l742 742q19 19 45 19t45 -19l166 -166q19 -19 19 -45t-19 -45z" />
+    <glyph glyph-name="chevron_right" unicode="&#xf054;" horiz-adv-x="1280" 
+d="M1107 659l-742 -742q-19 -19 -45 -19t-45 19l-166 166q-19 19 -19 45t19 45l531 531l-531 531q-19 19 -19 45t19 45l166 166q19 19 45 19t45 -19l742 -742q19 -19 19 -45t-19 -45z" />
+    <glyph glyph-name="plus_sign" unicode="&#xf055;" 
+d="M1216 576v128q0 26 -19 45t-45 19h-256v256q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-256h-256q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h256v-256q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v256h256q26 0 45 19t19 45zM1536 640q0 -209 -103 -385.5
+t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="minus_sign" unicode="&#xf056;" 
+d="M1216 576v128q0 26 -19 45t-45 19h-768q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h768q26 0 45 19t19 45zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5
+t103 -385.5z" />
+    <glyph glyph-name="remove_sign" unicode="&#xf057;" 
+d="M1149 414q0 26 -19 45l-181 181l181 181q19 19 19 45q0 27 -19 46l-90 90q-19 19 -46 19q-26 0 -45 -19l-181 -181l-181 181q-19 19 -45 19q-27 0 -46 -19l-90 -90q-19 -19 -19 -46q0 -26 19 -45l181 -181l-181 -181q-19 -19 -19 -45q0 -27 19 -46l90 -90q19 -19 46 -19
+q26 0 45 19l181 181l181 -181q19 -19 45 -19q27 0 46 19l90 90q19 19 19 46zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="ok_sign" unicode="&#xf058;" 
+d="M1284 802q0 28 -18 46l-91 90q-19 19 -45 19t-45 -19l-408 -407l-226 226q-19 19 -45 19t-45 -19l-91 -90q-18 -18 -18 -46q0 -27 18 -45l362 -362q19 -19 45 -19q27 0 46 19l543 543q18 18 18 45zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103
+t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="question_sign" unicode="&#xf059;" 
+d="M896 160v192q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h192q14 0 23 9t9 23zM1152 832q0 88 -55.5 163t-138.5 116t-170 41q-243 0 -371 -213q-15 -24 8 -42l132 -100q7 -6 19 -6q16 0 25 12q53 68 86 92q34 24 86 24q48 0 85.5 -26t37.5 -59
+q0 -38 -20 -61t-68 -45q-63 -28 -115.5 -86.5t-52.5 -125.5v-36q0 -14 9 -23t23 -9h192q14 0 23 9t9 23q0 19 21.5 49.5t54.5 49.5q32 18 49 28.5t46 35t44.5 48t28 60.5t12.5 81zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5
+t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="info_sign" unicode="&#xf05a;" 
+d="M1024 160v160q0 14 -9 23t-23 9h-96v512q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-160q0 -14 9 -23t23 -9h96v-320h-96q-14 0 -23 -9t-9 -23v-160q0 -14 9 -23t23 -9h448q14 0 23 9t9 23zM896 1056v160q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-160q0 -14 9 -23
+t23 -9h192q14 0 23 9t9 23zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="screenshot" unicode="&#xf05b;" 
+d="M1197 512h-109q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h109q-32 108 -112.5 188.5t-188.5 112.5v-109q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v109q-108 -32 -188.5 -112.5t-112.5 -188.5h109q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-109
+q32 -108 112.5 -188.5t188.5 -112.5v109q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-109q108 32 188.5 112.5t112.5 188.5zM1536 704v-128q0 -26 -19 -45t-45 -19h-143q-37 -161 -154.5 -278.5t-278.5 -154.5v-143q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v143
+q-161 37 -278.5 154.5t-154.5 278.5h-143q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h143q37 161 154.5 278.5t278.5 154.5v143q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-143q161 -37 278.5 -154.5t154.5 -278.5h143q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="remove_circle" unicode="&#xf05c;" 
+d="M1097 457l-146 -146q-10 -10 -23 -10t-23 10l-137 137l-137 -137q-10 -10 -23 -10t-23 10l-146 146q-10 10 -10 23t10 23l137 137l-137 137q-10 10 -10 23t10 23l146 146q10 10 23 10t23 -10l137 -137l137 137q10 10 23 10t23 -10l146 -146q10 -10 10 -23t-10 -23
+l-137 -137l137 -137q10 -10 10 -23t-10 -23zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5
+t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="ok_circle" unicode="&#xf05d;" 
+d="M1171 723l-422 -422q-19 -19 -45 -19t-45 19l-294 294q-19 19 -19 45t19 45l102 102q19 19 45 19t45 -19l147 -147l275 275q19 19 45 19t45 -19l102 -102q19 -19 19 -45t-19 -45zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198
+t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="ban_circle" unicode="&#xf05e;" 
+d="M1312 643q0 161 -87 295l-754 -753q137 -89 297 -89q111 0 211.5 43.5t173.5 116.5t116 174.5t43 212.5zM313 344l755 754q-135 91 -300 91q-148 0 -273 -73t-198 -199t-73 -274q0 -162 89 -299zM1536 643q0 -157 -61 -300t-163.5 -246t-245 -164t-298.5 -61t-298.5 61
+t-245 164t-163.5 246t-61 300t61 299.5t163.5 245.5t245 164t298.5 61t298.5 -61t245 -164t163.5 -245.5t61 -299.5z" />
+    <glyph glyph-name="arrow_left" unicode="&#xf060;" 
+d="M1536 640v-128q0 -53 -32.5 -90.5t-84.5 -37.5h-704l293 -294q38 -36 38 -90t-38 -90l-75 -76q-37 -37 -90 -37q-52 0 -91 37l-651 652q-37 37 -37 90q0 52 37 91l651 650q38 38 91 38q52 0 90 -38l75 -74q38 -38 38 -91t-38 -91l-293 -293h704q52 0 84.5 -37.5
+t32.5 -90.5z" />
+    <glyph glyph-name="arrow_right" unicode="&#xf061;" 
+d="M1472 576q0 -54 -37 -91l-651 -651q-39 -37 -91 -37q-51 0 -90 37l-75 75q-38 38 -38 91t38 91l293 293h-704q-52 0 -84.5 37.5t-32.5 90.5v128q0 53 32.5 90.5t84.5 37.5h704l-293 294q-38 36 -38 90t38 90l75 75q38 38 90 38q53 0 91 -38l651 -651q37 -35 37 -90z" />
+    <glyph glyph-name="arrow_up" unicode="&#xf062;" horiz-adv-x="1664" 
+d="M1611 565q0 -51 -37 -90l-75 -75q-38 -38 -91 -38q-54 0 -90 38l-294 293v-704q0 -52 -37.5 -84.5t-90.5 -32.5h-128q-53 0 -90.5 32.5t-37.5 84.5v704l-294 -293q-36 -38 -90 -38t-90 38l-75 75q-38 38 -38 90q0 53 38 91l651 651q35 37 90 37q54 0 91 -37l651 -651
+q37 -39 37 -91z" />
+    <glyph glyph-name="arrow_down" unicode="&#xf063;" horiz-adv-x="1664" 
+d="M1611 704q0 -53 -37 -90l-651 -652q-39 -37 -91 -37q-53 0 -90 37l-651 652q-38 36 -38 90q0 53 38 91l74 75q39 37 91 37q53 0 90 -37l294 -294v704q0 52 38 90t90 38h128q52 0 90 -38t38 -90v-704l294 294q37 37 90 37q52 0 91 -37l75 -75q37 -39 37 -91z" />
+    <glyph glyph-name="share_alt" unicode="&#xf064;" horiz-adv-x="1792" 
+d="M1792 896q0 -26 -19 -45l-512 -512q-19 -19 -45 -19t-45 19t-19 45v256h-224q-98 0 -175.5 -6t-154 -21.5t-133 -42.5t-105.5 -69.5t-80 -101t-48.5 -138.5t-17.5 -181q0 -55 5 -123q0 -6 2.5 -23.5t2.5 -26.5q0 -15 -8.5 -25t-23.5 -10q-16 0 -28 17q-7 9 -13 22
+t-13.5 30t-10.5 24q-127 285 -127 451q0 199 53 333q162 403 875 403h224v256q0 26 19 45t45 19t45 -19l512 -512q19 -19 19 -45z" />
+    <glyph glyph-name="resize_full" unicode="&#xf065;" 
+d="M755 480q0 -13 -10 -23l-332 -332l144 -144q19 -19 19 -45t-19 -45t-45 -19h-448q-26 0 -45 19t-19 45v448q0 26 19 45t45 19t45 -19l144 -144l332 332q10 10 23 10t23 -10l114 -114q10 -10 10 -23zM1536 1344v-448q0 -26 -19 -45t-45 -19t-45 19l-144 144l-332 -332
+q-10 -10 -23 -10t-23 10l-114 114q-10 10 -10 23t10 23l332 332l-144 144q-19 19 -19 45t19 45t45 19h448q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="resize_small" unicode="&#xf066;" 
+d="M768 576v-448q0 -26 -19 -45t-45 -19t-45 19l-144 144l-332 -332q-10 -10 -23 -10t-23 10l-114 114q-10 10 -10 23t10 23l332 332l-144 144q-19 19 -19 45t19 45t45 19h448q26 0 45 -19t19 -45zM1523 1248q0 -13 -10 -23l-332 -332l144 -144q19 -19 19 -45t-19 -45
+t-45 -19h-448q-26 0 -45 19t-19 45v448q0 26 19 45t45 19t45 -19l144 -144l332 332q10 10 23 10t23 -10l114 -114q10 -10 10 -23z" />
+    <glyph glyph-name="plus" unicode="&#xf067;" horiz-adv-x="1408" 
+d="M1408 800v-192q0 -40 -28 -68t-68 -28h-416v-416q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v416h-416q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h416v416q0 40 28 68t68 28h192q40 0 68 -28t28 -68v-416h416q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="minus" unicode="&#xf068;" horiz-adv-x="1408" 
+d="M1408 800v-192q0 -40 -28 -68t-68 -28h-1216q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h1216q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="asterisk" unicode="&#xf069;" horiz-adv-x="1664" 
+d="M1482 486q46 -26 59.5 -77.5t-12.5 -97.5l-64 -110q-26 -46 -77.5 -59.5t-97.5 12.5l-266 153v-307q0 -52 -38 -90t-90 -38h-128q-52 0 -90 38t-38 90v307l-266 -153q-46 -26 -97.5 -12.5t-77.5 59.5l-64 110q-26 46 -12.5 97.5t59.5 77.5l266 154l-266 154
+q-46 26 -59.5 77.5t12.5 97.5l64 110q26 46 77.5 59.5t97.5 -12.5l266 -153v307q0 52 38 90t90 38h128q52 0 90 -38t38 -90v-307l266 153q46 26 97.5 12.5t77.5 -59.5l64 -110q26 -46 12.5 -97.5t-59.5 -77.5l-266 -154z" />
+    <glyph glyph-name="exclamation_sign" unicode="&#xf06a;" 
+d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM896 161v190q0 14 -9 23.5t-22 9.5h-192q-13 0 -23 -10t-10 -23v-190q0 -13 10 -23t23 -10h192
+q13 0 22 9.5t9 23.5zM894 505l18 621q0 12 -10 18q-10 8 -24 8h-220q-14 0 -24 -8q-10 -6 -10 -18l17 -621q0 -10 10 -17.5t24 -7.5h185q14 0 23.5 7.5t10.5 17.5z" />
+    <glyph glyph-name="gift" unicode="&#xf06b;" 
+d="M928 180v56v468v192h-320v-192v-468v-56q0 -25 18 -38.5t46 -13.5h192q28 0 46 13.5t18 38.5zM472 1024h195l-126 161q-26 31 -69 31q-40 0 -68 -28t-28 -68t28 -68t68 -28zM1160 1120q0 40 -28 68t-68 28q-43 0 -69 -31l-125 -161h194q40 0 68 28t28 68zM1536 864v-320
+q0 -14 -9 -23t-23 -9h-96v-416q0 -40 -28 -68t-68 -28h-1088q-40 0 -68 28t-28 68v416h-96q-14 0 -23 9t-9 23v320q0 14 9 23t23 9h440q-93 0 -158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5q107 0 168 -77l128 -165l128 165q61 77 168 77q93 0 158.5 -65.5t65.5 -158.5
+t-65.5 -158.5t-158.5 -65.5h440q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="leaf" unicode="&#xf06c;" horiz-adv-x="1792" 
+d="M1280 832q0 26 -19 45t-45 19q-172 0 -318 -49.5t-259.5 -134t-235.5 -219.5q-19 -21 -19 -45q0 -26 19 -45t45 -19q24 0 45 19q27 24 74 71t67 66q137 124 268.5 176t313.5 52q26 0 45 19t19 45zM1792 1030q0 -95 -20 -193q-46 -224 -184.5 -383t-357.5 -268
+q-214 -108 -438 -108q-148 0 -286 47q-15 5 -88 42t-96 37q-16 0 -39.5 -32t-45 -70t-52.5 -70t-60 -32q-43 0 -63.5 17.5t-45.5 59.5q-2 4 -6 11t-5.5 10t-3 9.5t-1.5 13.5q0 35 31 73.5t68 65.5t68 56t31 48q0 4 -14 38t-16 44q-9 51 -9 104q0 115 43.5 220t119 184.5
+t170.5 139t204 95.5q55 18 145 25.5t179.5 9t178.5 6t163.5 24t113.5 56.5l29.5 29.5t29.5 28t27 20t36.5 16t43.5 4.5q39 0 70.5 -46t47.5 -112t24 -124t8 -96z" />
+    <glyph glyph-name="fire" unicode="&#xf06d;" horiz-adv-x="1408" 
+d="M1408 -160v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-1344q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h1344q13 0 22.5 -9.5t9.5 -22.5zM1152 896q0 -78 -24.5 -144t-64 -112.5t-87.5 -88t-96 -77.5t-87.5 -72t-64 -81.5t-24.5 -96.5q0 -96 67 -224l-4 1l1 -1
+q-90 41 -160 83t-138.5 100t-113.5 122.5t-72.5 150.5t-27.5 184q0 78 24.5 144t64 112.5t87.5 88t96 77.5t87.5 72t64 81.5t24.5 96.5q0 94 -66 224l3 -1l-1 1q90 -41 160 -83t138.5 -100t113.5 -122.5t72.5 -150.5t27.5 -184z" />
+    <glyph glyph-name="eye_open" unicode="&#xf06e;" horiz-adv-x="1792" 
+d="M1664 576q-152 236 -381 353q61 -104 61 -225q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 121 61 225q-229 -117 -381 -353q133 -205 333.5 -326.5t434.5 -121.5t434.5 121.5t333.5 326.5zM944 960q0 20 -14 34t-34 14q-125 0 -214.5 -89.5
+t-89.5 -214.5q0 -20 14 -34t34 -14t34 14t14 34q0 86 61 147t147 61q20 0 34 14t14 34zM1792 576q0 -34 -20 -69q-140 -230 -376.5 -368.5t-499.5 -138.5t-499.5 139t-376.5 368q-20 35 -20 69t20 69q140 229 376.5 368t499.5 139t499.5 -139t376.5 -368q20 -35 20 -69z" />
+    <glyph glyph-name="eye_close" unicode="&#xf070;" horiz-adv-x="1792" 
+d="M555 201l78 141q-87 63 -136 159t-49 203q0 121 61 225q-229 -117 -381 -353q167 -258 427 -375zM944 960q0 20 -14 34t-34 14q-125 0 -214.5 -89.5t-89.5 -214.5q0 -20 14 -34t34 -14t34 14t14 34q0 86 61 147t147 61q20 0 34 14t14 34zM1307 1151q0 -7 -1 -9
+q-106 -189 -316 -567t-315 -566l-49 -89q-10 -16 -28 -16q-12 0 -134 70q-16 10 -16 28q0 12 44 87q-143 65 -263.5 173t-208.5 245q-20 31 -20 69t20 69q153 235 380 371t496 136q89 0 180 -17l54 97q10 16 28 16q5 0 18 -6t31 -15.5t33 -18.5t31.5 -18.5t19.5 -11.5
+q16 -10 16 -27zM1344 704q0 -139 -79 -253.5t-209 -164.5l280 502q8 -45 8 -84zM1792 576q0 -35 -20 -69q-39 -64 -109 -145q-150 -172 -347.5 -267t-419.5 -95l74 132q212 18 392.5 137t301.5 307q-115 179 -282 294l63 112q95 -64 182.5 -153t144.5 -184q20 -34 20 -69z
+" />
+    <glyph glyph-name="warning_sign" unicode="&#xf071;" horiz-adv-x="1792" 
+d="M1024 161v190q0 14 -9.5 23.5t-22.5 9.5h-192q-13 0 -22.5 -9.5t-9.5 -23.5v-190q0 -14 9.5 -23.5t22.5 -9.5h192q13 0 22.5 9.5t9.5 23.5zM1022 535l18 459q0 12 -10 19q-13 11 -24 11h-220q-11 0 -24 -11q-10 -7 -10 -21l17 -457q0 -10 10 -16.5t24 -6.5h185
+q14 0 23.5 6.5t10.5 16.5zM1008 1469l768 -1408q35 -63 -2 -126q-17 -29 -46.5 -46t-63.5 -17h-1536q-34 0 -63.5 17t-46.5 46q-37 63 -2 126l768 1408q17 31 47 49t65 18t65 -18t47 -49z" />
+    <glyph glyph-name="plane" unicode="&#xf072;" horiz-adv-x="1408" 
+d="M1376 1376q44 -52 12 -148t-108 -172l-161 -161l160 -696q5 -19 -12 -33l-128 -96q-7 -6 -19 -6q-4 0 -7 1q-15 3 -21 16l-279 508l-259 -259l53 -194q5 -17 -8 -31l-96 -96q-9 -9 -23 -9h-2q-15 2 -24 13l-189 252l-252 189q-11 7 -13 23q-1 13 9 25l96 97q9 9 23 9
+q6 0 8 -1l194 -53l259 259l-508 279q-14 8 -17 24q-2 16 9 27l128 128q14 13 30 8l665 -159l160 160q76 76 172 108t148 -12z" />
+    <glyph glyph-name="calendar" unicode="&#xf073;" horiz-adv-x="1664" 
+d="M128 -128h288v288h-288v-288zM480 -128h320v288h-320v-288zM128 224h288v320h-288v-320zM480 224h320v320h-320v-320zM128 608h288v288h-288v-288zM864 -128h320v288h-320v-288zM480 608h320v288h-320v-288zM1248 -128h288v288h-288v-288zM864 224h320v320h-320v-320z
+M512 1088v288q0 13 -9.5 22.5t-22.5 9.5h-64q-13 0 -22.5 -9.5t-9.5 -22.5v-288q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5zM1248 224h288v320h-288v-320zM864 608h320v288h-320v-288zM1248 608h288v288h-288v-288zM1280 1088v288q0 13 -9.5 22.5t-22.5 9.5h-64
+q-13 0 -22.5 -9.5t-9.5 -22.5v-288q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5zM1664 1152v-1280q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47
+h64q66 0 113 -47t47 -113v-96h128q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="random" unicode="&#xf074;" horiz-adv-x="1792" 
+d="M666 1055q-60 -92 -137 -273q-22 45 -37 72.5t-40.5 63.5t-51 56.5t-63 35t-81.5 14.5h-224q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h224q250 0 410 -225zM1792 256q0 -14 -9 -23l-320 -320q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5v192q-32 0 -85 -0.5t-81 -1t-73 1
+t-71 5t-64 10.5t-63 18.5t-58 28.5t-59 40t-55 53.5t-56 69.5q59 93 136 273q22 -45 37 -72.5t40.5 -63.5t51 -56.5t63 -35t81.5 -14.5h256v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23zM1792 1152q0 -14 -9 -23l-320 -320q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5
+v192h-256q-48 0 -87 -15t-69 -45t-51 -61.5t-45 -77.5q-32 -62 -78 -171q-29 -66 -49.5 -111t-54 -105t-64 -100t-74 -83t-90 -68.5t-106.5 -42t-128 -16.5h-224q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h224q48 0 87 15t69 45t51 61.5t45 77.5q32 62 78 171q29 66 49.5 111
+t54 105t64 100t74 83t90 68.5t106.5 42t128 16.5h256v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23z" />
+    <glyph glyph-name="comment" unicode="&#xf075;" horiz-adv-x="1792" 
+d="M1792 640q0 -174 -120 -321.5t-326 -233t-450 -85.5q-70 0 -145 8q-198 -175 -460 -242q-49 -14 -114 -22q-17 -2 -30.5 9t-17.5 29v1q-3 4 -0.5 12t2 10t4.5 9.5l6 9t7 8.5t8 9q7 8 31 34.5t34.5 38t31 39.5t32.5 51t27 59t26 76q-157 89 -247.5 220t-90.5 281
+q0 130 71 248.5t191 204.5t286 136.5t348 50.5q244 0 450 -85.5t326 -233t120 -321.5z" />
+    <glyph glyph-name="magnet" unicode="&#xf076;" 
+d="M1536 704v-128q0 -201 -98.5 -362t-274 -251.5t-395.5 -90.5t-395.5 90.5t-274 251.5t-98.5 362v128q0 26 19 45t45 19h384q26 0 45 -19t19 -45v-128q0 -52 23.5 -90t53.5 -57t71 -30t64 -13t44 -2t44 2t64 13t71 30t53.5 57t23.5 90v128q0 26 19 45t45 19h384
+q26 0 45 -19t19 -45zM512 1344v-384q0 -26 -19 -45t-45 -19h-384q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h384q26 0 45 -19t19 -45zM1536 1344v-384q0 -26 -19 -45t-45 -19h-384q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h384q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="chevron_up" unicode="&#xf077;" horiz-adv-x="1792" 
+d="M1683 205l-166 -165q-19 -19 -45 -19t-45 19l-531 531l-531 -531q-19 -19 -45 -19t-45 19l-166 165q-19 19 -19 45.5t19 45.5l742 741q19 19 45 19t45 -19l742 -741q19 -19 19 -45.5t-19 -45.5z" />
+    <glyph glyph-name="chevron_down" unicode="&#xf078;" horiz-adv-x="1792" 
+d="M1683 728l-742 -741q-19 -19 -45 -19t-45 19l-742 741q-19 19 -19 45.5t19 45.5l166 165q19 19 45 19t45 -19l531 -531l531 531q19 19 45 19t45 -19l166 -165q19 -19 19 -45.5t-19 -45.5z" />
+    <glyph glyph-name="retweet" unicode="&#xf079;" horiz-adv-x="1920" 
+d="M1280 32q0 -13 -9.5 -22.5t-22.5 -9.5h-960q-8 0 -13.5 2t-9 7t-5.5 8t-3 11.5t-1 11.5v13v11v160v416h-192q-26 0 -45 19t-19 45q0 24 15 41l320 384q19 22 49 22t49 -22l320 -384q15 -17 15 -41q0 -26 -19 -45t-45 -19h-192v-384h576q16 0 25 -11l160 -192q7 -10 7 -21
+zM1920 448q0 -24 -15 -41l-320 -384q-20 -23 -49 -23t-49 23l-320 384q-15 17 -15 41q0 26 19 45t45 19h192v384h-576q-16 0 -25 12l-160 192q-7 9 -7 20q0 13 9.5 22.5t22.5 9.5h960q8 0 13.5 -2t9 -7t5.5 -8t3 -11.5t1 -11.5v-13v-11v-160v-416h192q26 0 45 -19t19 -45z
+" />
+    <glyph glyph-name="shopping_cart" unicode="&#xf07a;" horiz-adv-x="1664" 
+d="M640 0q0 -52 -38 -90t-90 -38t-90 38t-38 90t38 90t90 38t90 -38t38 -90zM1536 0q0 -52 -38 -90t-90 -38t-90 38t-38 90t38 90t90 38t90 -38t38 -90zM1664 1088v-512q0 -24 -16.5 -42.5t-40.5 -21.5l-1044 -122q13 -60 13 -70q0 -16 -24 -64h920q26 0 45 -19t19 -45
+t-19 -45t-45 -19h-1024q-26 0 -45 19t-19 45q0 11 8 31.5t16 36t21.5 40t15.5 29.5l-177 823h-204q-26 0 -45 19t-19 45t19 45t45 19h256q16 0 28.5 -6.5t19.5 -15.5t13 -24.5t8 -26t5.5 -29.5t4.5 -26h1201q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="folder_close" unicode="&#xf07b;" horiz-adv-x="1664" 
+d="M1664 928v-704q0 -92 -66 -158t-158 -66h-1216q-92 0 -158 66t-66 158v960q0 92 66 158t158 66h320q92 0 158 -66t66 -158v-32h672q92 0 158 -66t66 -158z" />
+    <glyph glyph-name="folder_open" unicode="&#xf07c;" horiz-adv-x="1920" 
+d="M1879 584q0 -31 -31 -66l-336 -396q-43 -51 -120.5 -86.5t-143.5 -35.5h-1088q-34 0 -60.5 13t-26.5 43q0 31 31 66l336 396q43 51 120.5 86.5t143.5 35.5h1088q34 0 60.5 -13t26.5 -43zM1536 928v-160h-832q-94 0 -197 -47.5t-164 -119.5l-337 -396l-5 -6q0 4 -0.5 12.5
+t-0.5 12.5v960q0 92 66 158t158 66h320q92 0 158 -66t66 -158v-32h544q92 0 158 -66t66 -158z" />
+    <glyph glyph-name="resize_vertical" unicode="&#xf07d;" horiz-adv-x="768" 
+d="M704 1216q0 -26 -19 -45t-45 -19h-128v-1024h128q26 0 45 -19t19 -45t-19 -45l-256 -256q-19 -19 -45 -19t-45 19l-256 256q-19 19 -19 45t19 45t45 19h128v1024h-128q-26 0 -45 19t-19 45t19 45l256 256q19 19 45 19t45 -19l256 -256q19 -19 19 -45z" />
+    <glyph glyph-name="resize_horizontal" unicode="&#xf07e;" horiz-adv-x="1792" 
+d="M1792 640q0 -26 -19 -45l-256 -256q-19 -19 -45 -19t-45 19t-19 45v128h-1024v-128q0 -26 -19 -45t-45 -19t-45 19l-256 256q-19 19 -19 45t19 45l256 256q19 19 45 19t45 -19t19 -45v-128h1024v128q0 26 19 45t45 19t45 -19l256 -256q19 -19 19 -45z" />
+    <glyph glyph-name="bar_chart" unicode="&#xf080;" horiz-adv-x="2048" 
+d="M640 640v-512h-256v512h256zM1024 1152v-1024h-256v1024h256zM2048 0v-128h-2048v1536h128v-1408h1920zM1408 896v-768h-256v768h256zM1792 1280v-1152h-256v1152h256z" />
+    <glyph glyph-name="twitter_sign" unicode="&#xf081;" 
+d="M1280 926q-56 -25 -121 -34q68 40 93 117q-65 -38 -134 -51q-61 66 -153 66q-87 0 -148.5 -61.5t-61.5 -148.5q0 -29 5 -48q-129 7 -242 65t-192 155q-29 -50 -29 -106q0 -114 91 -175q-47 1 -100 26v-2q0 -75 50 -133.5t123 -72.5q-29 -8 -51 -8q-13 0 -39 4
+q21 -63 74.5 -104t121.5 -42q-116 -90 -261 -90q-26 0 -50 3q148 -94 322 -94q112 0 210 35.5t168 95t120.5 137t75 162t24.5 168.5q0 18 -1 27q63 45 105 109zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5
+t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="facebook_sign" unicode="&#xf082;" 
+d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-188v595h199l30 232h-229v148q0 56 23.5 84t91.5 28l122 1v207q-63 9 -178 9q-136 0 -217.5 -80t-81.5 -226v-171h-200v-232h200v-595h-532q-119 0 -203.5 84.5t-84.5 203.5v960
+q0 119 84.5 203.5t203.5 84.5h960z" />
+    <glyph glyph-name="camera_retro" unicode="&#xf083;" horiz-adv-x="1792" 
+d="M928 704q0 14 -9 23t-23 9q-66 0 -113 -47t-47 -113q0 -14 9 -23t23 -9t23 9t9 23q0 40 28 68t68 28q14 0 23 9t9 23zM1152 574q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181zM128 0h1536v128h-1536v-128zM1280 574q0 159 -112.5 271.5
+t-271.5 112.5t-271.5 -112.5t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5zM256 1216h384v128h-384v-128zM128 1024h1536v118v138h-828l-64 -128h-644v-128zM1792 1280v-1280q0 -53 -37.5 -90.5t-90.5 -37.5h-1536q-53 0 -90.5 37.5t-37.5 90.5v1280
+q0 53 37.5 90.5t90.5 37.5h1536q53 0 90.5 -37.5t37.5 -90.5z" />
+    <glyph glyph-name="key" unicode="&#xf084;" horiz-adv-x="1792" 
+d="M832 1024q0 80 -56 136t-136 56t-136 -56t-56 -136q0 -42 19 -83q-41 19 -83 19q-80 0 -136 -56t-56 -136t56 -136t136 -56t136 56t56 136q0 42 -19 83q41 -19 83 -19q80 0 136 56t56 136zM1683 320q0 -17 -49 -66t-66 -49q-9 0 -28.5 16t-36.5 33t-38.5 40t-24.5 26
+l-96 -96l220 -220q28 -28 28 -68q0 -42 -39 -81t-81 -39q-40 0 -68 28l-671 671q-176 -131 -365 -131q-163 0 -265.5 102.5t-102.5 265.5q0 160 95 313t248 248t313 95q163 0 265.5 -102.5t102.5 -265.5q0 -189 -131 -365l355 -355l96 96q-3 3 -26 24.5t-40 38.5t-33 36.5
+t-16 28.5q0 17 49 66t66 49q13 0 23 -10q6 -6 46 -44.5t82 -79.5t86.5 -86t73 -78t28.5 -41z" />
+    <glyph glyph-name="cogs" unicode="&#xf085;" horiz-adv-x="1920" 
+d="M896 640q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1664 128q0 52 -38 90t-90 38t-90 -38t-38 -90q0 -53 37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1664 1152q0 52 -38 90t-90 38t-90 -38t-38 -90q0 -53 37.5 -90.5t90.5 -37.5
+t90.5 37.5t37.5 90.5zM1280 731v-185q0 -10 -7 -19.5t-16 -10.5l-155 -24q-11 -35 -32 -76q34 -48 90 -115q7 -11 7 -20q0 -12 -7 -19q-23 -30 -82.5 -89.5t-78.5 -59.5q-11 0 -21 7l-115 90q-37 -19 -77 -31q-11 -108 -23 -155q-7 -24 -30 -24h-186q-11 0 -20 7.5t-10 17.5
+l-23 153q-34 10 -75 31l-118 -89q-7 -7 -20 -7q-11 0 -21 8q-144 133 -144 160q0 9 7 19q10 14 41 53t47 61q-23 44 -35 82l-152 24q-10 1 -17 9.5t-7 19.5v185q0 10 7 19.5t16 10.5l155 24q11 35 32 76q-34 48 -90 115q-7 11 -7 20q0 12 7 20q22 30 82 89t79 59q11 0 21 -7
+l115 -90q34 18 77 32q11 108 23 154q7 24 30 24h186q11 0 20 -7.5t10 -17.5l23 -153q34 -10 75 -31l118 89q8 7 20 7q11 0 21 -8q144 -133 144 -160q0 -8 -7 -19q-12 -16 -42 -54t-45 -60q23 -48 34 -82l152 -23q10 -2 17 -10.5t7 -19.5zM1920 198v-140q0 -16 -149 -31
+q-12 -27 -30 -52q51 -113 51 -138q0 -4 -4 -7q-122 -71 -124 -71q-8 0 -46 47t-52 68q-20 -2 -30 -2t-30 2q-14 -21 -52 -68t-46 -47q-2 0 -124 71q-4 3 -4 7q0 25 51 138q-18 25 -30 52q-149 15 -149 31v140q0 16 149 31q13 29 30 52q-51 113 -51 138q0 4 4 7q4 2 35 20
+t59 34t30 16q8 0 46 -46.5t52 -67.5q20 2 30 2t30 -2q51 71 92 112l6 2q4 0 124 -70q4 -3 4 -7q0 -25 -51 -138q17 -23 30 -52q149 -15 149 -31zM1920 1222v-140q0 -16 -149 -31q-12 -27 -30 -52q51 -113 51 -138q0 -4 -4 -7q-122 -71 -124 -71q-8 0 -46 47t-52 68
+q-20 -2 -30 -2t-30 2q-14 -21 -52 -68t-46 -47q-2 0 -124 71q-4 3 -4 7q0 25 51 138q-18 25 -30 52q-149 15 -149 31v140q0 16 149 31q13 29 30 52q-51 113 -51 138q0 4 4 7q4 2 35 20t59 34t30 16q8 0 46 -46.5t52 -67.5q20 2 30 2t30 -2q51 71 92 112l6 2q4 0 124 -70
+q4 -3 4 -7q0 -25 -51 -138q17 -23 30 -52q149 -15 149 -31z" />
+    <glyph glyph-name="comments" unicode="&#xf086;" horiz-adv-x="1792" 
+d="M1408 768q0 -139 -94 -257t-256.5 -186.5t-353.5 -68.5q-86 0 -176 16q-124 -88 -278 -128q-36 -9 -86 -16h-3q-11 0 -20.5 8t-11.5 21q-1 3 -1 6.5t0.5 6.5t2 6l2.5 5t3.5 5.5t4 5t4.5 5t4 4.5q5 6 23 25t26 29.5t22.5 29t25 38.5t20.5 44q-124 72 -195 177t-71 224
+q0 139 94 257t256.5 186.5t353.5 68.5t353.5 -68.5t256.5 -186.5t94 -257zM1792 512q0 -120 -71 -224.5t-195 -176.5q10 -24 20.5 -44t25 -38.5t22.5 -29t26 -29.5t23 -25q1 -1 4 -4.5t4.5 -5t4 -5t3.5 -5.5l2.5 -5t2 -6t0.5 -6.5t-1 -6.5q-3 -14 -13 -22t-22 -7
+q-50 7 -86 16q-154 40 -278 128q-90 -16 -176 -16q-271 0 -472 132q58 -4 88 -4q161 0 309 45t264 129q125 92 192 212t67 254q0 77 -23 152q129 -71 204 -178t75 -230z" />
+    <glyph glyph-name="thumbs_up_alt" unicode="&#xf087;" 
+d="M256 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 768q0 51 -39 89.5t-89 38.5h-352q0 58 48 159.5t48 160.5q0 98 -32 145t-128 47q-26 -26 -38 -85t-30.5 -125.5t-59.5 -109.5q-22 -23 -77 -91q-4 -5 -23 -30t-31.5 -41t-34.5 -42.5
+t-40 -44t-38.5 -35.5t-40 -27t-35.5 -9h-32v-640h32q13 0 31.5 -3t33 -6.5t38 -11t35 -11.5t35.5 -12.5t29 -10.5q211 -73 342 -73h121q192 0 192 167q0 26 -5 56q30 16 47.5 52.5t17.5 73.5t-18 69q53 50 53 119q0 25 -10 55.5t-25 47.5q32 1 53.5 47t21.5 81zM1536 769
+q0 -89 -49 -163q9 -33 9 -69q0 -77 -38 -144q3 -21 3 -43q0 -101 -60 -178q1 -139 -85 -219.5t-227 -80.5h-36h-93q-96 0 -189.5 22.5t-216.5 65.5q-116 40 -138 40h-288q-53 0 -90.5 37.5t-37.5 90.5v640q0 53 37.5 90.5t90.5 37.5h274q36 24 137 155q58 75 107 128
+q24 25 35.5 85.5t30.5 126.5t62 108q39 37 90 37q84 0 151 -32.5t102 -101.5t35 -186q0 -93 -48 -192h176q104 0 180 -76t76 -179z" />
+    <glyph glyph-name="thumbs_down_alt" unicode="&#xf088;" 
+d="M256 1088q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 512q0 35 -21.5 81t-53.5 47q15 17 25 47.5t10 55.5q0 69 -53 119q18 31 18 69q0 37 -17.5 73.5t-47.5 52.5q5 30 5 56q0 85 -49 126t-136 41h-128q-131 0 -342 -73q-5 -2 -29 -10.5
+t-35.5 -12.5t-35 -11.5t-38 -11t-33 -6.5t-31.5 -3h-32v-640h32q16 0 35.5 -9t40 -27t38.5 -35.5t40 -44t34.5 -42.5t31.5 -41t23 -30q55 -68 77 -91q41 -43 59.5 -109.5t30.5 -125.5t38 -85q96 0 128 47t32 145q0 59 -48 160.5t-48 159.5h352q50 0 89 38.5t39 89.5z
+M1536 511q0 -103 -76 -179t-180 -76h-176q48 -99 48 -192q0 -118 -35 -186q-35 -69 -102 -101.5t-151 -32.5q-51 0 -90 37q-34 33 -54 82t-25.5 90.5t-17.5 84.5t-31 64q-48 50 -107 127q-101 131 -137 155h-274q-53 0 -90.5 37.5t-37.5 90.5v640q0 53 37.5 90.5t90.5 37.5
+h288q22 0 138 40q128 44 223 66t200 22h112q140 0 226.5 -79t85.5 -216v-5q60 -77 60 -178q0 -22 -3 -43q38 -67 38 -144q0 -36 -9 -69q49 -73 49 -163z" />
+    <glyph glyph-name="star_half" unicode="&#xf089;" horiz-adv-x="896" 
+d="M832 1504v-1339l-449 -236q-22 -12 -40 -12q-21 0 -31.5 14.5t-10.5 35.5q0 6 2 20l86 500l-364 354q-25 27 -25 48q0 37 56 46l502 73l225 455q19 41 49 41z" />
+    <glyph glyph-name="heart_empty" unicode="&#xf08a;" horiz-adv-x="1792" 
+d="M1664 940q0 81 -21.5 143t-55 98.5t-81.5 59.5t-94 31t-98 8t-112 -25.5t-110.5 -64t-86.5 -72t-60 -61.5q-18 -22 -49 -22t-49 22q-24 28 -60 61.5t-86.5 72t-110.5 64t-112 25.5t-98 -8t-94 -31t-81.5 -59.5t-55 -98.5t-21.5 -143q0 -168 187 -355l581 -560l580 559
+q188 188 188 356zM1792 940q0 -221 -229 -450l-623 -600q-18 -18 -44 -18t-44 18l-624 602q-10 8 -27.5 26t-55.5 65.5t-68 97.5t-53.5 121t-23.5 138q0 220 127 344t351 124q62 0 126.5 -21.5t120 -58t95.5 -68.5t76 -68q36 36 76 68t95.5 68.5t120 58t126.5 21.5
+q224 0 351 -124t127 -344z" />
+    <glyph glyph-name="signout" unicode="&#xf08b;" horiz-adv-x="1664" 
+d="M640 96q0 -4 1 -20t0.5 -26.5t-3 -23.5t-10 -19.5t-20.5 -6.5h-320q-119 0 -203.5 84.5t-84.5 203.5v704q0 119 84.5 203.5t203.5 84.5h320q13 0 22.5 -9.5t9.5 -22.5q0 -4 1 -20t0.5 -26.5t-3 -23.5t-10 -19.5t-20.5 -6.5h-320q-66 0 -113 -47t-47 -113v-704
+q0 -66 47 -113t113 -47h288h11h13t11.5 -1t11.5 -3t8 -5.5t7 -9t2 -13.5zM1568 640q0 -26 -19 -45l-544 -544q-19 -19 -45 -19t-45 19t-19 45v288h-448q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h448v288q0 26 19 45t45 19t45 -19l544 -544q19 -19 19 -45z" />
+    <glyph glyph-name="linkedin_sign" unicode="&#xf08c;" 
+d="M237 122h231v694h-231v-694zM483 1030q-1 52 -36 86t-93 34t-94.5 -34t-36.5 -86q0 -51 35.5 -85.5t92.5 -34.5h1q59 0 95 34.5t36 85.5zM1068 122h231v398q0 154 -73 233t-193 79q-136 0 -209 -117h2v101h-231q3 -66 0 -694h231v388q0 38 7 56q15 35 45 59.5t74 24.5
+q116 0 116 -157v-371zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="pushpin" unicode="&#xf08d;" horiz-adv-x="1152" 
+d="M480 672v448q0 14 -9 23t-23 9t-23 -9t-9 -23v-448q0 -14 9 -23t23 -9t23 9t9 23zM1152 320q0 -26 -19 -45t-45 -19h-429l-51 -483q-2 -12 -10.5 -20.5t-20.5 -8.5h-1q-27 0 -32 27l-76 485h-404q-26 0 -45 19t-19 45q0 123 78.5 221.5t177.5 98.5v512q-52 0 -90 38
+t-38 90t38 90t90 38h640q52 0 90 -38t38 -90t-38 -90t-90 -38v-512q99 0 177.5 -98.5t78.5 -221.5z" />
+    <glyph glyph-name="external_link" unicode="&#xf08e;" horiz-adv-x="1792" 
+d="M1408 608v-320q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h704q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-704q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v320
+q0 14 9 23t23 9h64q14 0 23 -9t9 -23zM1792 1472v-512q0 -26 -19 -45t-45 -19t-45 19l-176 176l-652 -652q-10 -10 -23 -10t-23 10l-114 114q-10 10 -10 23t10 23l652 652l-176 176q-19 19 -19 45t19 45t45 19h512q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="signin" unicode="&#xf090;" 
+d="M1184 640q0 -26 -19 -45l-544 -544q-19 -19 -45 -19t-45 19t-19 45v288h-448q-26 0 -45 19t-19 45v384q0 26 19 45t45 19h448v288q0 26 19 45t45 19t45 -19l544 -544q19 -19 19 -45zM1536 992v-704q0 -119 -84.5 -203.5t-203.5 -84.5h-320q-13 0 -22.5 9.5t-9.5 22.5
+q0 4 -1 20t-0.5 26.5t3 23.5t10 19.5t20.5 6.5h320q66 0 113 47t47 113v704q0 66 -47 113t-113 47h-288h-11h-13t-11.5 1t-11.5 3t-8 5.5t-7 9t-2 13.5q0 4 -1 20t-0.5 26.5t3 23.5t10 19.5t20.5 6.5h320q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="trophy" unicode="&#xf091;" horiz-adv-x="1664" 
+d="M458 653q-74 162 -74 371h-256v-96q0 -78 94.5 -162t235.5 -113zM1536 928v96h-256q0 -209 -74 -371q141 29 235.5 113t94.5 162zM1664 1056v-128q0 -71 -41.5 -143t-112 -130t-173 -97.5t-215.5 -44.5q-42 -54 -95 -95q-38 -34 -52.5 -72.5t-14.5 -89.5q0 -54 30.5 -91
+t97.5 -37q75 0 133.5 -45.5t58.5 -114.5v-64q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23v64q0 69 58.5 114.5t133.5 45.5q67 0 97.5 37t30.5 91q0 51 -14.5 89.5t-52.5 72.5q-53 41 -95 95q-113 5 -215.5 44.5t-173 97.5t-112 130t-41.5 143v128q0 40 28 68t68 28h288v96
+q0 66 47 113t113 47h576q66 0 113 -47t47 -113v-96h288q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="github_sign" unicode="&#xf092;" 
+d="M519 336q4 6 -3 13q-9 7 -14 2q-4 -6 3 -13q9 -7 14 -2zM491 377q-5 7 -12 4q-6 -4 0 -12q7 -8 12 -5q6 4 0 13zM450 417q2 4 -5 8q-7 2 -8 -2q-3 -5 4 -8q8 -2 9 2zM471 394q2 1 1.5 4.5t-3.5 5.5q-6 7 -10 3t1 -11q6 -6 11 -2zM557 319q2 7 -9 11q-9 3 -13 -4
+q-2 -7 9 -11q9 -3 13 4zM599 316q0 8 -12 8q-10 0 -10 -8t11 -8t11 8zM638 323q-2 7 -13 5t-9 -9q2 -8 12 -6t10 10zM1280 640q0 212 -150 362t-362 150t-362 -150t-150 -362q0 -167 98 -300.5t252 -185.5q18 -3 26.5 5t8.5 20q0 52 -1 95q-6 -1 -15.5 -2.5t-35.5 -2t-48 4
+t-43.5 20t-29.5 41.5q-23 59 -57 74q-2 1 -4.5 3.5l-8 8t-7 9.5t4 7.5t19.5 3.5q6 0 15 -2t30 -15.5t33 -35.5q16 -28 37.5 -42t43.5 -14t38 3.5t30 9.5q7 47 33 69q-49 6 -86 18.5t-73 39t-55.5 76t-19.5 119.5q0 79 53 137q-24 62 5 136q19 6 54.5 -7.5t60.5 -29.5l26 -16
+q58 17 128 17t128 -17q11 7 28.5 18t55.5 26t57 9q29 -74 5 -136q53 -58 53 -137q0 -57 -14 -100.5t-35.5 -70t-53.5 -44.5t-62.5 -26t-68.5 -12q35 -31 35 -95q0 -40 -0.5 -89t-0.5 -51q0 -12 8.5 -20t26.5 -5q154 52 252 185.5t98 300.5zM1536 1120v-960
+q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="upload_alt" unicode="&#xf093;" horiz-adv-x="1664" 
+d="M1280 64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1536 64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 288v-320q0 -40 -28 -68t-68 -28h-1472q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h427q21 -56 70.5 -92
+t110.5 -36h256q61 0 110.5 36t70.5 92h427q40 0 68 -28t28 -68zM1339 936q-17 -40 -59 -40h-256v-448q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v448h-256q-42 0 -59 40q-17 39 14 69l448 448q18 19 45 19t45 -19l448 -448q31 -30 14 -69z" />
+    <glyph glyph-name="lemon" unicode="&#xf094;" 
+d="M1407 710q0 44 -7 113.5t-18 96.5q-12 30 -17 44t-9 36.5t-4 48.5q0 23 5 68.5t5 67.5q0 37 -10 55q-4 1 -13 1q-19 0 -58 -4.5t-59 -4.5q-60 0 -176 24t-175 24q-43 0 -94.5 -11.5t-85 -23.5t-89.5 -34q-137 -54 -202 -103q-96 -73 -159.5 -189.5t-88 -236t-24.5 -248.5
+q0 -40 12.5 -120t12.5 -121q0 -23 -11 -66.5t-11 -65.5t12 -36.5t34 -14.5q24 0 72.5 11t73.5 11q57 0 169.5 -15.5t169.5 -15.5q181 0 284 36q129 45 235.5 152.5t166 245.5t59.5 275zM1535 712q0 -165 -70 -327.5t-196 -288t-281 -180.5q-124 -44 -326 -44
+q-57 0 -170 14.5t-169 14.5q-24 0 -72.5 -14.5t-73.5 -14.5q-73 0 -123.5 55.5t-50.5 128.5q0 24 11 68t11 67q0 40 -12.5 120.5t-12.5 121.5q0 111 18 217.5t54.5 209.5t100.5 194t150 156q78 59 232 120q194 78 316 78q60 0 175.5 -24t173.5 -24q19 0 57 5t58 5
+q81 0 118 -50.5t37 -134.5q0 -23 -5 -68t-5 -68q0 -13 2 -25t3.5 -16.5t7.5 -20.5t8 -20q16 -40 25 -118.5t9 -136.5z" />
+    <glyph glyph-name="phone" unicode="&#xf095;" horiz-adv-x="1408" 
+d="M1408 296q0 -27 -10 -70.5t-21 -68.5q-21 -50 -122 -106q-94 -51 -186 -51q-27 0 -53 3.5t-57.5 12.5t-47 14.5t-55.5 20.5t-49 18q-98 35 -175 83q-127 79 -264 216t-216 264q-48 77 -83 175q-3 9 -18 49t-20.5 55.5t-14.5 47t-12.5 57.5t-3.5 53q0 92 51 186
+q56 101 106 122q25 11 68.5 21t70.5 10q14 0 21 -3q18 -6 53 -76q11 -19 30 -54t35 -63.5t31 -53.5q3 -4 17.5 -25t21.5 -35.5t7 -28.5q0 -20 -28.5 -50t-62 -55t-62 -53t-28.5 -46q0 -9 5 -22.5t8.5 -20.5t14 -24t11.5 -19q76 -137 174 -235t235 -174q2 -1 19 -11.5t24 -14
+t20.5 -8.5t22.5 -5q18 0 46 28.5t53 62t55 62t50 28.5q14 0 28.5 -7t35.5 -21.5t25 -17.5q25 -15 53.5 -31t63.5 -35t54 -30q70 -35 76 -53q3 -7 3 -21z" />
+    <glyph glyph-name="check_empty" unicode="&#xf096;" horiz-adv-x="1408" 
+d="M1120 1280h-832q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113v832q0 66 -47 113t-113 47zM1408 1120v-832q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832
+q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="bookmark_empty" unicode="&#xf097;" horiz-adv-x="1280" 
+d="M1152 1280h-1024v-1242l423 406l89 85l89 -85l423 -406v1242zM1164 1408q23 0 44 -9q33 -13 52.5 -41t19.5 -62v-1289q0 -34 -19.5 -62t-52.5 -41q-19 -8 -44 -8q-48 0 -83 32l-441 424l-441 -424q-36 -33 -83 -33q-23 0 -44 9q-33 13 -52.5 41t-19.5 62v1289
+q0 34 19.5 62t52.5 41q21 9 44 9h1048z" />
+    <glyph glyph-name="phone_sign" unicode="&#xf098;" 
+d="M1280 343q0 11 -2 16t-18 16.5t-40.5 25t-47.5 26.5t-45.5 25t-28.5 15q-5 3 -19 13t-25 15t-21 5q-15 0 -36.5 -20.5t-39.5 -45t-38.5 -45t-33.5 -20.5q-7 0 -16.5 3.5t-15.5 6.5t-17 9.5t-14 8.5q-99 55 -170 126.5t-127 170.5q-2 3 -8.5 14t-9.5 17t-6.5 15.5
+t-3.5 16.5q0 13 20.5 33.5t45 38.5t45 39.5t20.5 36.5q0 10 -5 21t-15 25t-13 19q-3 6 -15 28.5t-25 45.5t-26.5 47.5t-25 40.5t-16.5 18t-16 2q-48 0 -101 -22q-46 -21 -80 -94.5t-34 -130.5q0 -16 2.5 -34t5 -30.5t9 -33t10 -29.5t12.5 -33t11 -30q60 -164 216.5 -320.5
+t320.5 -216.5q6 -2 30 -11t33 -12.5t29.5 -10t33 -9t30.5 -5t34 -2.5q57 0 130.5 34t94.5 80q22 53 22 101zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z
+" />
+    <glyph glyph-name="twitter" unicode="&#xf099;" horiz-adv-x="1664" 
+d="M1620 1128q-67 -98 -162 -167q1 -14 1 -42q0 -130 -38 -259.5t-115.5 -248.5t-184.5 -210.5t-258 -146t-323 -54.5q-271 0 -496 145q35 -4 78 -4q225 0 401 138q-105 2 -188 64.5t-114 159.5q33 -5 61 -5q43 0 85 11q-112 23 -185.5 111.5t-73.5 205.5v4q68 -38 146 -41
+q-66 44 -105 115t-39 154q0 88 44 163q121 -149 294.5 -238.5t371.5 -99.5q-8 38 -8 74q0 134 94.5 228.5t228.5 94.5q140 0 236 -102q109 21 205 78q-37 -115 -142 -178q93 10 186 50z" />
+    <glyph glyph-name="facebook" unicode="&#xf09a;" horiz-adv-x="1024" 
+d="M959 1524v-264h-157q-86 0 -116 -36t-30 -108v-189h293l-39 -296h-254v-759h-306v759h-255v296h255v218q0 186 104 288.5t277 102.5q147 0 228 -12z" />
+    <glyph glyph-name="github" unicode="&#xf09b;" 
+d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5q0 -251 -146.5 -451.5t-378.5 -277.5q-27 -5 -40 7t-13 30q0 3 0.5 76.5t0.5 134.5q0 97 -52 142q57 6 102.5 18t94 39t81 66.5t53 105t20.5 150.5q0 119 -79 206q37 91 -8 204q-28 9 -81 -11t-92 -44l-38 -24
+q-93 26 -192 26t-192 -26q-16 11 -42.5 27t-83.5 38.5t-85 13.5q-45 -113 -8 -204q-79 -87 -79 -206q0 -85 20.5 -150t52.5 -105t80.5 -67t94 -39t102.5 -18q-39 -36 -49 -103q-21 -10 -45 -15t-57 -5t-65.5 21.5t-55.5 62.5q-19 32 -48.5 52t-49.5 24l-20 3q-21 0 -29 -4.5
+t-5 -11.5t9 -14t13 -12l7 -5q22 -10 43.5 -38t31.5 -51l10 -23q13 -38 44 -61.5t67 -30t69.5 -7t55.5 3.5l23 4q0 -38 0.5 -88.5t0.5 -54.5q0 -18 -13 -30t-40 -7q-232 77 -378.5 277.5t-146.5 451.5q0 209 103 385.5t279.5 279.5t385.5 103zM291 305q3 7 -7 12
+q-10 3 -13 -2q-3 -7 7 -12q9 -6 13 2zM322 271q7 5 -2 16q-10 9 -16 3q-7 -5 2 -16q10 -10 16 -3zM352 226q9 7 0 19q-8 13 -17 6q-9 -5 0 -18t17 -7zM394 184q8 8 -4 19q-12 12 -20 3q-9 -8 4 -19q12 -12 20 -3zM451 159q3 11 -13 16q-15 4 -19 -7t13 -15q15 -6 19 6z
+M514 154q0 13 -17 11q-16 0 -16 -11q0 -13 17 -11q16 0 16 11zM572 164q-2 11 -18 9q-16 -3 -14 -15t18 -8t14 14z" />
+    <glyph glyph-name="unlock" unicode="&#xf09c;" horiz-adv-x="1664" 
+d="M1664 960v-256q0 -26 -19 -45t-45 -19h-64q-26 0 -45 19t-19 45v256q0 106 -75 181t-181 75t-181 -75t-75 -181v-192h96q40 0 68 -28t28 -68v-576q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v576q0 40 28 68t68 28h672v192q0 185 131.5 316.5t316.5 131.5
+t316.5 -131.5t131.5 -316.5z" />
+    <glyph glyph-name="credit_card" unicode="&#xf09d;" horiz-adv-x="1920" 
+d="M1760 1408q66 0 113 -47t47 -113v-1216q0 -66 -47 -113t-113 -47h-1600q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1600zM160 1280q-13 0 -22.5 -9.5t-9.5 -22.5v-224h1664v224q0 13 -9.5 22.5t-22.5 9.5h-1600zM1760 0q13 0 22.5 9.5t9.5 22.5v608h-1664v-608
+q0 -13 9.5 -22.5t22.5 -9.5h1600zM256 128v128h256v-128h-256zM640 128v128h384v-128h-384z" />
+    <glyph glyph-name="rss" unicode="&#xf09e;" horiz-adv-x="1408" 
+d="M384 192q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM896 69q2 -28 -17 -48q-18 -21 -47 -21h-135q-25 0 -43 16.5t-20 41.5q-22 229 -184.5 391.5t-391.5 184.5q-25 2 -41.5 20t-16.5 43v135q0 29 21 47q17 17 43 17h5q160 -13 306 -80.5
+t259 -181.5q114 -113 181.5 -259t80.5 -306zM1408 67q2 -27 -18 -47q-18 -20 -46 -20h-143q-26 0 -44.5 17.5t-19.5 42.5q-12 215 -101 408.5t-231.5 336t-336 231.5t-408.5 102q-25 1 -42.5 19.5t-17.5 43.5v143q0 28 20 46q18 18 44 18h3q262 -13 501.5 -120t425.5 -294
+q187 -186 294 -425.5t120 -501.5z" />
+    <glyph glyph-name="hdd" unicode="&#xf0a0;" 
+d="M1040 320q0 -33 -23.5 -56.5t-56.5 -23.5t-56.5 23.5t-23.5 56.5t23.5 56.5t56.5 23.5t56.5 -23.5t23.5 -56.5zM1296 320q0 -33 -23.5 -56.5t-56.5 -23.5t-56.5 23.5t-23.5 56.5t23.5 56.5t56.5 23.5t56.5 -23.5t23.5 -56.5zM1408 160v320q0 13 -9.5 22.5t-22.5 9.5
+h-1216q-13 0 -22.5 -9.5t-9.5 -22.5v-320q0 -13 9.5 -22.5t22.5 -9.5h1216q13 0 22.5 9.5t9.5 22.5zM178 640h1180l-157 482q-4 13 -16 21.5t-26 8.5h-782q-14 0 -26 -8.5t-16 -21.5zM1536 480v-320q0 -66 -47 -113t-113 -47h-1216q-66 0 -113 47t-47 113v320q0 25 16 75
+l197 606q17 53 63 86t101 33h782q55 0 101 -33t63 -86l197 -606q16 -50 16 -75z" />
+    <glyph glyph-name="bullhorn" unicode="&#xf0a1;" horiz-adv-x="1792" 
+d="M1664 896q53 0 90.5 -37.5t37.5 -90.5t-37.5 -90.5t-90.5 -37.5v-384q0 -52 -38 -90t-90 -38q-417 347 -812 380q-58 -19 -91 -66t-31 -100.5t40 -92.5q-20 -33 -23 -65.5t6 -58t33.5 -55t48 -50t61.5 -50.5q-29 -58 -111.5 -83t-168.5 -11.5t-132 55.5q-7 23 -29.5 87.5
+t-32 94.5t-23 89t-15 101t3.5 98.5t22 110.5h-122q-66 0 -113 47t-47 113v192q0 66 47 113t113 47h480q435 0 896 384q52 0 90 -38t38 -90v-384zM1536 292v954q-394 -302 -768 -343v-270q377 -42 768 -341z" />
+    <glyph glyph-name="bell" unicode="&#xf0a2;" horiz-adv-x="1792" 
+d="M912 -160q0 16 -16 16q-59 0 -101.5 42.5t-42.5 101.5q0 16 -16 16t-16 -16q0 -73 51.5 -124.5t124.5 -51.5q16 0 16 16zM246 128h1300q-266 300 -266 832q0 51 -24 105t-69 103t-121.5 80.5t-169.5 31.5t-169.5 -31.5t-121.5 -80.5t-69 -103t-24 -105q0 -532 -266 -832z
+M1728 128q0 -52 -38 -90t-90 -38h-448q0 -106 -75 -181t-181 -75t-181 75t-75 181h-448q-52 0 -90 38t-38 90q50 42 91 88t85 119.5t74.5 158.5t50 206t19.5 260q0 152 117 282.5t307 158.5q-8 19 -8 39q0 40 28 68t68 28t68 -28t28 -68q0 -20 -8 -39q190 -28 307 -158.5
+t117 -282.5q0 -139 19.5 -260t50 -206t74.5 -158.5t85 -119.5t91 -88z" />
+    <glyph glyph-name="certificate" unicode="&#xf0a3;" 
+d="M1376 640l138 -135q30 -28 20 -70q-12 -41 -52 -51l-188 -48l53 -186q12 -41 -19 -70q-29 -31 -70 -19l-186 53l-48 -188q-10 -40 -51 -52q-12 -2 -19 -2q-31 0 -51 22l-135 138l-135 -138q-28 -30 -70 -20q-41 11 -51 52l-48 188l-186 -53q-41 -12 -70 19q-31 29 -19 70
+l53 186l-188 48q-40 10 -52 51q-10 42 20 70l138 135l-138 135q-30 28 -20 70q12 41 52 51l188 48l-53 186q-12 41 19 70q29 31 70 19l186 -53l48 188q10 41 51 51q41 12 70 -19l135 -139l135 139q29 30 70 19q41 -10 51 -51l48 -188l186 53q41 12 70 -19q31 -29 19 -70
+l-53 -186l188 -48q40 -10 52 -51q10 -42 -20 -70z" />
+    <glyph glyph-name="hand_right" unicode="&#xf0a4;" horiz-adv-x="1792" 
+d="M256 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1664 768q0 51 -39 89.5t-89 38.5h-576q0 20 15 48.5t33 55t33 68t15 84.5q0 67 -44.5 97.5t-115.5 30.5q-24 0 -90 -139q-24 -44 -37 -65q-40 -64 -112 -145q-71 -81 -101 -106
+q-69 -57 -140 -57h-32v-640h32q72 0 167 -32t193.5 -64t179.5 -32q189 0 189 167q0 26 -5 56q30 16 47.5 52.5t17.5 73.5t-18 69q53 50 53 119q0 25 -10 55.5t-25 47.5h331q52 0 90 38t38 90zM1792 769q0 -105 -75.5 -181t-180.5 -76h-169q-4 -62 -37 -119q3 -21 3 -43
+q0 -101 -60 -178q1 -139 -85 -219.5t-227 -80.5q-133 0 -322 69q-164 59 -223 59h-288q-53 0 -90.5 37.5t-37.5 90.5v640q0 53 37.5 90.5t90.5 37.5h288q10 0 21.5 4.5t23.5 14t22.5 18t24 22.5t20.5 21.5t19 21.5t14 17q65 74 100 129q13 21 33 62t37 72t40.5 63t55 49.5
+t69.5 17.5q125 0 206.5 -67t81.5 -189q0 -68 -22 -128h374q104 0 180 -76t76 -179z" />
+    <glyph glyph-name="hand_left" unicode="&#xf0a5;" horiz-adv-x="1792" 
+d="M1376 128h32v640h-32q-35 0 -67.5 12t-62.5 37t-50 46t-49 54q-8 9 -12 14q-72 81 -112 145q-14 22 -38 68q-1 3 -10.5 22.5t-18.5 36t-20 35.5t-21.5 30.5t-18.5 11.5q-71 0 -115.5 -30.5t-44.5 -97.5q0 -43 15 -84.5t33 -68t33 -55t15 -48.5h-576q-50 0 -89 -38.5
+t-39 -89.5q0 -52 38 -90t90 -38h331q-15 -17 -25 -47.5t-10 -55.5q0 -69 53 -119q-18 -32 -18 -69t17.5 -73.5t47.5 -52.5q-4 -24 -4 -56q0 -85 48.5 -126t135.5 -41q84 0 183 32t194 64t167 32zM1664 192q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45z
+M1792 768v-640q0 -53 -37.5 -90.5t-90.5 -37.5h-288q-59 0 -223 -59q-190 -69 -317 -69q-142 0 -230 77.5t-87 217.5l1 5q-61 76 -61 178q0 22 3 43q-33 57 -37 119h-169q-105 0 -180.5 76t-75.5 181q0 103 76 179t180 76h374q-22 60 -22 128q0 122 81.5 189t206.5 67
+q38 0 69.5 -17.5t55 -49.5t40.5 -63t37 -72t33 -62q35 -55 100 -129q2 -3 14 -17t19 -21.5t20.5 -21.5t24 -22.5t22.5 -18t23.5 -14t21.5 -4.5h288q53 0 90.5 -37.5t37.5 -90.5z" />
+    <glyph glyph-name="hand_up" unicode="&#xf0a6;" 
+d="M1280 -64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 700q0 189 -167 189q-26 0 -56 -5q-16 30 -52.5 47.5t-73.5 17.5t-69 -18q-50 53 -119 53q-25 0 -55.5 -10t-47.5 -25v331q0 52 -38 90t-90 38q-51 0 -89.5 -39t-38.5 -89v-576
+q-20 0 -48.5 15t-55 33t-68 33t-84.5 15q-67 0 -97.5 -44.5t-30.5 -115.5q0 -24 139 -90q44 -24 65 -37q64 -40 145 -112q81 -71 106 -101q57 -69 57 -140v-32h640v32q0 72 32 167t64 193.5t32 179.5zM1536 705q0 -133 -69 -322q-59 -164 -59 -223v-288q0 -53 -37.5 -90.5
+t-90.5 -37.5h-640q-53 0 -90.5 37.5t-37.5 90.5v288q0 10 -4.5 21.5t-14 23.5t-18 22.5t-22.5 24t-21.5 20.5t-21.5 19t-17 14q-74 65 -129 100q-21 13 -62 33t-72 37t-63 40.5t-49.5 55t-17.5 69.5q0 125 67 206.5t189 81.5q68 0 128 -22v374q0 104 76 180t179 76
+q105 0 181 -75.5t76 -180.5v-169q62 -4 119 -37q21 3 43 3q101 0 178 -60q139 1 219.5 -85t80.5 -227z" />
+    <glyph glyph-name="hand_down" unicode="&#xf0a7;" 
+d="M1408 576q0 84 -32 183t-64 194t-32 167v32h-640v-32q0 -35 -12 -67.5t-37 -62.5t-46 -50t-54 -49q-9 -8 -14 -12q-81 -72 -145 -112q-22 -14 -68 -38q-3 -1 -22.5 -10.5t-36 -18.5t-35.5 -20t-30.5 -21.5t-11.5 -18.5q0 -71 30.5 -115.5t97.5 -44.5q43 0 84.5 15t68 33
+t55 33t48.5 15v-576q0 -50 38.5 -89t89.5 -39q52 0 90 38t38 90v331q46 -35 103 -35q69 0 119 53q32 -18 69 -18t73.5 17.5t52.5 47.5q24 -4 56 -4q85 0 126 48.5t41 135.5zM1280 1344q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1536 580
+q0 -142 -77.5 -230t-217.5 -87l-5 1q-76 -61 -178 -61q-22 0 -43 3q-54 -30 -119 -37v-169q0 -105 -76 -180.5t-181 -75.5q-103 0 -179 76t-76 180v374q-54 -22 -128 -22q-121 0 -188.5 81.5t-67.5 206.5q0 38 17.5 69.5t49.5 55t63 40.5t72 37t62 33q55 35 129 100
+q3 2 17 14t21.5 19t21.5 20.5t22.5 24t18 22.5t14 23.5t4.5 21.5v288q0 53 37.5 90.5t90.5 37.5h640q53 0 90.5 -37.5t37.5 -90.5v-288q0 -59 59 -223q69 -190 69 -317z" />
+    <glyph glyph-name="circle_arrow_left" unicode="&#xf0a8;" 
+d="M1280 576v128q0 26 -19 45t-45 19h-502l189 189q19 19 19 45t-19 45l-91 91q-18 18 -45 18t-45 -18l-362 -362l-91 -91q-18 -18 -18 -45t18 -45l91 -91l362 -362q18 -18 45 -18t45 18l91 91q18 18 18 45t-18 45l-189 189h502q26 0 45 19t19 45zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="circle_arrow_right" unicode="&#xf0a9;" 
+d="M1285 640q0 27 -18 45l-91 91l-362 362q-18 18 -45 18t-45 -18l-91 -91q-18 -18 -18 -45t18 -45l189 -189h-502q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h502l-189 -189q-19 -19 -19 -45t19 -45l91 -91q18 -18 45 -18t45 18l362 362l91 91q18 18 18 45zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="circle_arrow_up" unicode="&#xf0aa;" 
+d="M1284 641q0 27 -18 45l-362 362l-91 91q-18 18 -45 18t-45 -18l-91 -91l-362 -362q-18 -18 -18 -45t18 -45l91 -91q18 -18 45 -18t45 18l189 189v-502q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v502l189 -189q19 -19 45 -19t45 19l91 91q18 18 18 45zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="circle_arrow_down" unicode="&#xf0ab;" 
+d="M1284 639q0 27 -18 45l-91 91q-18 18 -45 18t-45 -18l-189 -189v502q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-502l-189 189q-19 19 -45 19t-45 -19l-91 -91q-18 -18 -18 -45t18 -45l362 -362l91 -91q18 -18 45 -18t45 18l91 91l362 362q18 18 18 45zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="globe" unicode="&#xf0ac;" 
+d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM1042 887q-2 -1 -9.5 -9.5t-13.5 -9.5q2 0 4.5 5t5 11t3.5 7q6 7 22 15q14 6 52 12q34 8 51 -11
+q-2 2 9.5 13t14.5 12q3 2 15 4.5t15 7.5l2 22q-12 -1 -17.5 7t-6.5 21q0 -2 -6 -8q0 7 -4.5 8t-11.5 -1t-9 -1q-10 3 -15 7.5t-8 16.5t-4 15q-2 5 -9.5 11t-9.5 10q-1 2 -2.5 5.5t-3 6.5t-4 5.5t-5.5 2.5t-7 -5t-7.5 -10t-4.5 -5q-3 2 -6 1.5t-4.5 -1t-4.5 -3t-5 -3.5
+q-3 -2 -8.5 -3t-8.5 -2q15 5 -1 11q-10 4 -16 3q9 4 7.5 12t-8.5 14h5q-1 4 -8.5 8.5t-17.5 8.5t-13 6q-8 5 -34 9.5t-33 0.5q-5 -6 -4.5 -10.5t4 -14t3.5 -12.5q1 -6 -5.5 -13t-6.5 -12q0 -7 14 -15.5t10 -21.5q-3 -8 -16 -16t-16 -12q-5 -8 -1.5 -18.5t10.5 -16.5
+q2 -2 1.5 -4t-3.5 -4.5t-5.5 -4t-6.5 -3.5l-3 -2q-11 -5 -20.5 6t-13.5 26q-7 25 -16 30q-23 8 -29 -1q-5 13 -41 26q-25 9 -58 4q6 1 0 15q-7 15 -19 12q3 6 4 17.5t1 13.5q3 13 12 23q1 1 7 8.5t9.5 13.5t0.5 6q35 -4 50 11q5 5 11.5 17t10.5 17q9 6 14 5.5t14.5 -5.5
+t14.5 -5q14 -1 15.5 11t-7.5 20q12 -1 3 17q-4 7 -8 9q-12 4 -27 -5q-8 -4 2 -8q-1 1 -9.5 -10.5t-16.5 -17.5t-16 5q-1 1 -5.5 13.5t-9.5 13.5q-8 0 -16 -15q3 8 -11 15t-24 8q19 12 -8 27q-7 4 -20.5 5t-19.5 -4q-5 -7 -5.5 -11.5t5 -8t10.5 -5.5t11.5 -4t8.5 -3
+q14 -10 8 -14q-2 -1 -8.5 -3.5t-11.5 -4.5t-6 -4q-3 -4 0 -14t-2 -14q-5 5 -9 17.5t-7 16.5q7 -9 -25 -6l-10 1q-4 0 -16 -2t-20.5 -1t-13.5 8q-4 8 0 20q1 4 4 2q-4 3 -11 9.5t-10 8.5q-46 -15 -94 -41q6 -1 12 1q5 2 13 6.5t10 5.5q34 14 42 7l5 5q14 -16 20 -25
+q-7 4 -30 1q-20 -6 -22 -12q7 -12 5 -18q-4 3 -11.5 10t-14.5 11t-15 5q-16 0 -22 -1q-146 -80 -235 -222q7 -7 12 -8q4 -1 5 -9t2.5 -11t11.5 3q9 -8 3 -19q1 1 44 -27q19 -17 21 -21q3 -11 -10 -18q-1 2 -9 9t-9 4q-3 -5 0.5 -18.5t10.5 -12.5q-7 0 -9.5 -16t-2.5 -35.5
+t-1 -23.5l2 -1q-3 -12 5.5 -34.5t21.5 -19.5q-13 -3 20 -43q6 -8 8 -9q3 -2 12 -7.5t15 -10t10 -10.5q4 -5 10 -22.5t14 -23.5q-2 -6 9.5 -20t10.5 -23q-1 0 -2.5 -1t-2.5 -1q3 -7 15.5 -14t15.5 -13q1 -3 2 -10t3 -11t8 -2q2 20 -24 62q-15 25 -17 29q-3 5 -5.5 15.5
+t-4.5 14.5q2 0 6 -1.5t8.5 -3.5t7.5 -4t2 -3q-3 -7 2 -17.5t12 -18.5t17 -19t12 -13q6 -6 14 -19.5t0 -13.5q9 0 20 -10.5t17 -19.5q5 -8 8 -26t5 -24q2 -7 8.5 -13.5t12.5 -9.5l16 -8t13 -7q5 -2 18.5 -10.5t21.5 -11.5q10 -4 16 -4t14.5 2.5t13.5 3.5q15 2 29 -15t21 -21
+q36 -19 55 -11q-2 -1 0.5 -7.5t8 -15.5t9 -14.5t5.5 -8.5q5 -6 18 -15t18 -15q6 4 7 9q-3 -8 7 -20t18 -10q14 3 14 32q-31 -15 -49 18q0 1 -2.5 5.5t-4 8.5t-2.5 8.5t0 7.5t5 3q9 0 10 3.5t-2 12.5t-4 13q-1 8 -11 20t-12 15q-5 -9 -16 -8t-16 9q0 -1 -1.5 -5.5t-1.5 -6.5
+q-13 0 -15 1q1 3 2.5 17.5t3.5 22.5q1 4 5.5 12t7.5 14.5t4 12.5t-4.5 9.5t-17.5 2.5q-19 -1 -26 -20q-1 -3 -3 -10.5t-5 -11.5t-9 -7q-7 -3 -24 -2t-24 5q-13 8 -22.5 29t-9.5 37q0 10 2.5 26.5t3 25t-5.5 24.5q3 2 9 9.5t10 10.5q2 1 4.5 1.5t4.5 0t4 1.5t3 6q-1 1 -4 3
+q-3 3 -4 3q7 -3 28.5 1.5t27.5 -1.5q15 -11 22 2q0 1 -2.5 9.5t-0.5 13.5q5 -27 29 -9q3 -3 15.5 -5t17.5 -5q3 -2 7 -5.5t5.5 -4.5t5 0.5t8.5 6.5q10 -14 12 -24q11 -40 19 -44q7 -3 11 -2t4.5 9.5t0 14t-1.5 12.5l-1 8v18l-1 8q-15 3 -18.5 12t1.5 18.5t15 18.5q1 1 8 3.5
+t15.5 6.5t12.5 8q21 19 15 35q7 0 11 9q-1 0 -5 3t-7.5 5t-4.5 2q9 5 2 16q5 3 7.5 11t7.5 10q9 -12 21 -2q8 8 1 16q5 7 20.5 10.5t18.5 9.5q7 -2 8 2t1 12t3 12q4 5 15 9t13 5l17 11q3 4 0 4q18 -2 31 11q10 11 -6 20q3 6 -3 9.5t-15 5.5q3 1 11.5 0.5t10.5 1.5
+q15 10 -7 16q-17 5 -43 -12zM879 10q206 36 351 189q-3 3 -12.5 4.5t-12.5 3.5q-18 7 -24 8q1 7 -2.5 13t-8 9t-12.5 8t-11 7q-2 2 -7 6t-7 5.5t-7.5 4.5t-8.5 2t-10 -1l-3 -1q-3 -1 -5.5 -2.5t-5.5 -3t-4 -3t0 -2.5q-21 17 -36 22q-5 1 -11 5.5t-10.5 7t-10 1.5t-11.5 -7
+q-5 -5 -6 -15t-2 -13q-7 5 0 17.5t2 18.5q-3 6 -10.5 4.5t-12 -4.5t-11.5 -8.5t-9 -6.5t-8.5 -5.5t-8.5 -7.5q-3 -4 -6 -12t-5 -11q-2 4 -11.5 6.5t-9.5 5.5q2 -10 4 -35t5 -38q7 -31 -12 -48q-27 -25 -29 -40q-4 -22 12 -26q0 -7 -8 -20.5t-7 -21.5q0 -6 2 -16z" />
+    <glyph glyph-name="wrench" unicode="&#xf0ad;" horiz-adv-x="1664" 
+d="M384 64q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1028 484l-682 -682q-37 -37 -90 -37q-52 0 -91 37l-106 108q-38 36 -38 90q0 53 38 91l681 681q39 -98 114.5 -173.5t173.5 -114.5zM1662 919q0 -39 -23 -106q-47 -134 -164.5 -217.5
+t-258.5 -83.5q-185 0 -316.5 131.5t-131.5 316.5t131.5 316.5t316.5 131.5q58 0 121.5 -16.5t107.5 -46.5q16 -11 16 -28t-16 -28l-293 -169v-224l193 -107q5 3 79 48.5t135.5 81t70.5 35.5q15 0 23.5 -10t8.5 -25z" />
+    <glyph glyph-name="tasks" unicode="&#xf0ae;" horiz-adv-x="1792" 
+d="M1024 128h640v128h-640v-128zM640 640h1024v128h-1024v-128zM1280 1152h384v128h-384v-128zM1792 320v-256q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 832v-256q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19
+t-19 45v256q0 26 19 45t45 19h1664q26 0 45 -19t19 -45zM1792 1344v-256q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1664q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="filter" unicode="&#xf0b0;" horiz-adv-x="1408" 
+d="M1403 1241q17 -41 -14 -70l-493 -493v-742q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-256 256q-19 19 -19 45v486l-493 493q-31 29 -14 70q17 39 59 39h1280q42 0 59 -39z" />
+    <glyph glyph-name="briefcase" unicode="&#xf0b1;" horiz-adv-x="1792" 
+d="M640 1280h512v128h-512v-128zM1792 640v-480q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v480h672v-160q0 -26 19 -45t45 -19h320q26 0 45 19t19 45v160h672zM1024 640v-128h-256v128h256zM1792 1120v-384h-1792v384q0 66 47 113t113 47h352v160q0 40 28 68
+t68 28h576q40 0 68 -28t28 -68v-160h352q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="fullscreen" unicode="&#xf0b2;" 
+d="M1283 995l-355 -355l355 -355l144 144q29 31 70 14q39 -17 39 -59v-448q0 -26 -19 -45t-45 -19h-448q-42 0 -59 40q-17 39 14 69l144 144l-355 355l-355 -355l144 -144q31 -30 14 -69q-17 -40 -59 -40h-448q-26 0 -45 19t-19 45v448q0 42 40 59q39 17 69 -14l144 -144
+l355 355l-355 355l-144 -144q-19 -19 -45 -19q-12 0 -24 5q-40 17 -40 59v448q0 26 19 45t45 19h448q42 0 59 -40q17 -39 -14 -69l-144 -144l355 -355l355 355l-144 144q-31 30 -14 69q17 40 59 40h448q26 0 45 -19t19 -45v-448q0 -42 -39 -59q-13 -5 -25 -5q-26 0 -45 19z
+" />
+    <glyph glyph-name="group" unicode="&#xf0c0;" horiz-adv-x="1920" 
+d="M593 640q-162 -5 -265 -128h-134q-82 0 -138 40.5t-56 118.5q0 353 124 353q6 0 43.5 -21t97.5 -42.5t119 -21.5q67 0 133 23q-5 -37 -5 -66q0 -139 81 -256zM1664 3q0 -120 -73 -189.5t-194 -69.5h-874q-121 0 -194 69.5t-73 189.5q0 53 3.5 103.5t14 109t26.5 108.5
+t43 97.5t62 81t85.5 53.5t111.5 20q10 0 43 -21.5t73 -48t107 -48t135 -21.5t135 21.5t107 48t73 48t43 21.5q61 0 111.5 -20t85.5 -53.5t62 -81t43 -97.5t26.5 -108.5t14 -109t3.5 -103.5zM640 1280q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75
+t75 -181zM1344 896q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5zM1920 671q0 -78 -56 -118.5t-138 -40.5h-134q-103 123 -265 128q81 117 81 256q0 29 -5 66q66 -23 133 -23q59 0 119 21.5t97.5 42.5
+t43.5 21q124 0 124 -353zM1792 1280q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181z" />
+    <glyph glyph-name="link" unicode="&#xf0c1;" horiz-adv-x="1664" 
+d="M1456 320q0 40 -28 68l-208 208q-28 28 -68 28q-42 0 -72 -32q3 -3 19 -18.5t21.5 -21.5t15 -19t13 -25.5t3.5 -27.5q0 -40 -28 -68t-68 -28q-15 0 -27.5 3.5t-25.5 13t-19 15t-21.5 21.5t-18.5 19q-33 -31 -33 -73q0 -40 28 -68l206 -207q27 -27 68 -27q40 0 68 26
+l147 146q28 28 28 67zM753 1025q0 40 -28 68l-206 207q-28 28 -68 28q-39 0 -68 -27l-147 -146q-28 -28 -28 -67q0 -40 28 -68l208 -208q27 -27 68 -27q42 0 72 31q-3 3 -19 18.5t-21.5 21.5t-15 19t-13 25.5t-3.5 27.5q0 40 28 68t68 28q15 0 27.5 -3.5t25.5 -13t19 -15
+t21.5 -21.5t18.5 -19q33 31 33 73zM1648 320q0 -120 -85 -203l-147 -146q-83 -83 -203 -83q-121 0 -204 85l-206 207q-83 83 -83 203q0 123 88 209l-88 88q-86 -88 -208 -88q-120 0 -204 84l-208 208q-84 84 -84 204t85 203l147 146q83 83 203 83q121 0 204 -85l206 -207
+q83 -83 83 -203q0 -123 -88 -209l88 -88q86 88 208 88q120 0 204 -84l208 -208q84 -84 84 -204z" />
+    <glyph glyph-name="cloud" unicode="&#xf0c2;" horiz-adv-x="1920" 
+d="M1920 384q0 -159 -112.5 -271.5t-271.5 -112.5h-1088q-185 0 -316.5 131.5t-131.5 316.5q0 132 71 241.5t187 163.5q-2 28 -2 43q0 212 150 362t362 150q158 0 286.5 -88t187.5 -230q70 62 166 62q106 0 181 -75t75 -181q0 -75 -41 -138q129 -30 213 -134.5t84 -239.5z
+" />
+    <glyph glyph-name="beaker" unicode="&#xf0c3;" horiz-adv-x="1664" 
+d="M1527 88q56 -89 21.5 -152.5t-140.5 -63.5h-1152q-106 0 -140.5 63.5t21.5 152.5l503 793v399h-64q-26 0 -45 19t-19 45t19 45t45 19h512q26 0 45 -19t19 -45t-19 -45t-45 -19h-64v-399zM748 813l-272 -429h712l-272 429l-20 31v37v399h-128v-399v-37z" />
+    <glyph glyph-name="cut" unicode="&#xf0c4;" horiz-adv-x="1792" 
+d="M960 640q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1260 576l507 -398q28 -20 25 -56q-5 -35 -35 -51l-128 -64q-13 -7 -29 -7q-17 0 -31 8l-690 387l-110 -66q-8 -4 -12 -5q14 -49 10 -97q-7 -77 -56 -147.5t-132 -123.5q-132 -84 -277 -84
+q-136 0 -222 78q-90 84 -79 207q7 76 56 147t131 124q132 84 278 84q83 0 151 -31q9 13 22 22l122 73l-122 73q-13 9 -22 22q-68 -31 -151 -31q-146 0 -278 84q-82 53 -131 124t-56 147q-5 59 15.5 113t63.5 93q85 79 222 79q145 0 277 -84q83 -52 132 -123t56 -148
+q4 -48 -10 -97q4 -1 12 -5l110 -66l690 387q14 8 31 8q16 0 29 -7l128 -64q30 -16 35 -51q3 -36 -25 -56zM579 836q46 42 21 108t-106 117q-92 59 -192 59q-74 0 -113 -36q-46 -42 -21 -108t106 -117q92 -59 192 -59q74 0 113 36zM494 91q81 51 106 117t-21 108
+q-39 36 -113 36q-100 0 -192 -59q-81 -51 -106 -117t21 -108q39 -36 113 -36q100 0 192 59zM672 704l96 -58v11q0 36 33 56l14 8l-79 47l-26 -26q-3 -3 -10 -11t-12 -12q-2 -2 -4 -3.5t-3 -2.5zM896 480l96 -32l736 576l-128 64l-768 -431v-113l-160 -96l9 -8q2 -2 7 -6
+q4 -4 11 -12t11 -12l26 -26zM1600 64l128 64l-520 408l-177 -138q-2 -3 -13 -7z" />
+    <glyph glyph-name="copy" unicode="&#xf0c5;" horiz-adv-x="1792" 
+d="M1696 1152q40 0 68 -28t28 -68v-1216q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v288h-544q-40 0 -68 28t-28 68v672q0 40 20 88t48 76l408 408q28 28 76 48t88 20h416q40 0 68 -28t28 -68v-328q68 40 128 40h416zM1152 939l-299 -299h299v299zM512 1323l-299 -299
+h299v299zM708 676l316 316v416h-384v-416q0 -40 -28 -68t-68 -28h-416v-640h512v256q0 40 20 88t48 76zM1664 -128v1152h-384v-416q0 -40 -28 -68t-68 -28h-416v-640h896z" />
+    <glyph glyph-name="paper_clip" unicode="&#xf0c6;" horiz-adv-x="1408" 
+d="M1404 151q0 -117 -79 -196t-196 -79q-135 0 -235 100l-777 776q-113 115 -113 271q0 159 110 270t269 111q158 0 273 -113l605 -606q10 -10 10 -22q0 -16 -30.5 -46.5t-46.5 -30.5q-13 0 -23 10l-606 607q-79 77 -181 77q-106 0 -179 -75t-73 -181q0 -105 76 -181
+l776 -777q63 -63 145 -63q64 0 106 42t42 106q0 82 -63 145l-581 581q-26 24 -60 24q-29 0 -48 -19t-19 -48q0 -32 25 -59l410 -410q10 -10 10 -22q0 -16 -31 -47t-47 -31q-12 0 -22 10l-410 410q-63 61 -63 149q0 82 57 139t139 57q88 0 149 -63l581 -581q100 -98 100 -235
+z" />
+    <glyph glyph-name="save" unicode="&#xf0c7;" 
+d="M384 0h768v384h-768v-384zM1280 0h128v896q0 14 -10 38.5t-20 34.5l-281 281q-10 10 -34 20t-39 10v-416q0 -40 -28 -68t-68 -28h-576q-40 0 -68 28t-28 68v416h-128v-1280h128v416q0 40 28 68t68 28h832q40 0 68 -28t28 -68v-416zM896 928v320q0 13 -9.5 22.5t-22.5 9.5
+h-192q-13 0 -22.5 -9.5t-9.5 -22.5v-320q0 -13 9.5 -22.5t22.5 -9.5h192q13 0 22.5 9.5t9.5 22.5zM1536 896v-928q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h928q40 0 88 -20t76 -48l280 -280q28 -28 48 -76t20 -88z" />
+    <glyph glyph-name="sign_blank" unicode="&#xf0c8;" 
+d="M1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="reorder" unicode="&#xf0c9;" 
+d="M1536 192v-128q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1536 704v-128q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1536 1216v-128q0 -26 -19 -45
+t-45 -19h-1408q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h1408q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="ul" unicode="&#xf0ca;" horiz-adv-x="1792" 
+d="M384 128q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM384 640q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1792 224v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5
+t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5zM384 1152q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1792 736v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5z
+M1792 1248v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5z" />
+    <glyph glyph-name="ol" unicode="&#xf0cb;" horiz-adv-x="1792" 
+d="M381 -84q0 -80 -54.5 -126t-135.5 -46q-106 0 -172 66l57 88q49 -45 106 -45q29 0 50.5 14.5t21.5 42.5q0 64 -105 56l-26 56q8 10 32.5 43.5t42.5 54t37 38.5v1q-16 0 -48.5 -1t-48.5 -1v-53h-106v152h333v-88l-95 -115q51 -12 81 -49t30 -88zM383 543v-159h-362
+q-6 36 -6 54q0 51 23.5 93t56.5 68t66 47.5t56.5 43.5t23.5 45q0 25 -14.5 38.5t-39.5 13.5q-46 0 -81 -58l-85 59q24 51 71.5 79.5t105.5 28.5q73 0 123 -41.5t50 -112.5q0 -50 -34 -91.5t-75 -64.5t-75.5 -50.5t-35.5 -52.5h127v60h105zM1792 224v-192q0 -13 -9.5 -22.5
+t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 14 9 23t23 9h1216q13 0 22.5 -9.5t9.5 -22.5zM384 1123v-99h-335v99h107q0 41 0.5 121.5t0.5 121.5v12h-2q-8 -17 -50 -54l-71 76l136 127h106v-404h108zM1792 736v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216
+q-13 0 -22.5 9.5t-9.5 22.5v192q0 14 9 23t23 9h1216q13 0 22.5 -9.5t9.5 -22.5zM1792 1248v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1216q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1216q13 0 22.5 -9.5t9.5 -22.5z" />
+    <glyph glyph-name="strikethrough" unicode="&#xf0cc;" horiz-adv-x="1792" 
+d="M1760 640q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1728q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h1728zM483 704q-28 35 -51 80q-48 98 -48 188q0 181 134 309q133 127 393 127q50 0 167 -19q66 -12 177 -48q10 -38 21 -118q14 -123 14 -183q0 -18 -5 -45l-12 -3l-84 6
+l-14 2q-50 149 -103 205q-88 91 -210 91q-114 0 -182 -59q-67 -58 -67 -146q0 -73 66 -140t279 -129q69 -20 173 -66q58 -28 95 -52h-743zM990 448h411q7 -39 7 -92q0 -111 -41 -212q-23 -56 -71 -104q-37 -35 -109 -81q-80 -48 -153 -66q-80 -21 -203 -21q-114 0 -195 23
+l-140 40q-57 16 -72 28q-8 8 -8 22v13q0 108 -2 156q-1 30 0 68l2 37v44l102 2q15 -34 30 -71t22.5 -56t12.5 -27q35 -57 80 -94q43 -36 105 -57q59 -22 132 -22q64 0 139 27q77 26 122 86q47 61 47 129q0 84 -81 157q-34 29 -137 71z" />
+    <glyph glyph-name="underline" unicode="&#xf0cd;" 
+d="M48 1313q-37 2 -45 4l-3 88q13 1 40 1q60 0 112 -4q132 -7 166 -7q86 0 168 3q116 4 146 5q56 0 86 2l-1 -14l2 -64v-9q-60 -9 -124 -9q-60 0 -79 -25q-13 -14 -13 -132q0 -13 0.5 -32.5t0.5 -25.5l1 -229l14 -280q6 -124 51 -202q35 -59 96 -92q88 -47 177 -47
+q104 0 191 28q56 18 99 51q48 36 65 64q36 56 53 114q21 73 21 229q0 79 -3.5 128t-11 122.5t-13.5 159.5l-4 59q-5 67 -24 88q-34 35 -77 34l-100 -2l-14 3l2 86h84l205 -10q76 -3 196 10l18 -2q6 -38 6 -51q0 -7 -4 -31q-45 -12 -84 -13q-73 -11 -79 -17q-15 -15 -15 -41
+q0 -7 1.5 -27t1.5 -31q8 -19 22 -396q6 -195 -15 -304q-15 -76 -41 -122q-38 -65 -112 -123q-75 -57 -182 -89q-109 -33 -255 -33q-167 0 -284 46q-119 47 -179 122q-61 76 -83 195q-16 80 -16 237v333q0 188 -17 213q-25 36 -147 39zM1536 -96v64q0 14 -9 23t-23 9h-1472
+q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h1472q14 0 23 9t9 23z" />
+    <glyph glyph-name="table" unicode="&#xf0ce;" horiz-adv-x="1664" 
+d="M512 160v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM512 544v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1024 160v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23
+v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM512 928v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1024 544v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1536 160v192
+q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1024 928v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1536 544v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192
+q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1536 928v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1664 1248v-1088q0 -66 -47 -113t-113 -47h-1344q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1344q66 0 113 -47t47 -113
+z" />
+    <glyph glyph-name="magic" unicode="&#xf0d0;" horiz-adv-x="1664" 
+d="M1190 955l293 293l-107 107l-293 -293zM1637 1248q0 -27 -18 -45l-1286 -1286q-18 -18 -45 -18t-45 18l-198 198q-18 18 -18 45t18 45l1286 1286q18 18 45 18t45 -18l198 -198q18 -18 18 -45zM286 1438l98 -30l-98 -30l-30 -98l-30 98l-98 30l98 30l30 98zM636 1276
+l196 -60l-196 -60l-60 -196l-60 196l-196 60l196 60l60 196zM1566 798l98 -30l-98 -30l-30 -98l-30 98l-98 30l98 30l30 98zM926 1438l98 -30l-98 -30l-30 -98l-30 98l-98 30l98 30l30 98z" />
+    <glyph glyph-name="truck" unicode="&#xf0d1;" horiz-adv-x="1792" 
+d="M640 128q0 52 -38 90t-90 38t-90 -38t-38 -90t38 -90t90 -38t90 38t38 90zM256 640h384v256h-158q-13 0 -22 -9l-195 -195q-9 -9 -9 -22v-30zM1536 128q0 52 -38 90t-90 38t-90 -38t-38 -90t38 -90t90 -38t90 38t38 90zM1792 1216v-1024q0 -15 -4 -26.5t-13.5 -18.5
+t-16.5 -11.5t-23.5 -6t-22.5 -2t-25.5 0t-22.5 0.5q0 -106 -75 -181t-181 -75t-181 75t-75 181h-384q0 -106 -75 -181t-181 -75t-181 75t-75 181h-64q-3 0 -22.5 -0.5t-25.5 0t-22.5 2t-23.5 6t-16.5 11.5t-13.5 18.5t-4 26.5q0 26 19 45t45 19v320q0 8 -0.5 35t0 38
+t2.5 34.5t6.5 37t14 30.5t22.5 30l198 198q19 19 50.5 32t58.5 13h160v192q0 26 19 45t45 19h1024q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="pinterest" unicode="&#xf0d2;" 
+d="M1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103q-111 0 -218 32q59 93 78 164q9 34 54 211q20 -39 73 -67.5t114 -28.5q121 0 216 68.5t147 188.5t52 270q0 114 -59.5 214t-172.5 163t-255 63q-105 0 -196 -29t-154.5 -77t-109 -110.5t-67 -129.5t-21.5 -134
+q0 -104 40 -183t117 -111q30 -12 38 20q2 7 8 31t8 30q6 23 -11 43q-51 61 -51 151q0 151 104.5 259.5t273.5 108.5q151 0 235.5 -82t84.5 -213q0 -170 -68.5 -289t-175.5 -119q-61 0 -98 43.5t-23 104.5q8 35 26.5 93.5t30 103t11.5 75.5q0 50 -27 83t-77 33
+q-62 0 -105 -57t-43 -142q0 -73 25 -122l-99 -418q-17 -70 -13 -177q-206 91 -333 281t-127 423q0 209 103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="pinterest_sign" unicode="&#xf0d3;" 
+d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-725q85 122 108 210q9 34 53 209q21 -39 73.5 -67t112.5 -28q181 0 295.5 147.5t114.5 373.5q0 84 -35 162.5t-96.5 139t-152.5 97t-197 36.5q-104 0 -194.5 -28.5t-153 -76.5
+t-107.5 -109.5t-66.5 -128t-21.5 -132.5q0 -102 39.5 -180t116.5 -110q13 -5 23.5 0t14.5 19q10 44 15 61q6 23 -11 42q-50 62 -50 150q0 150 103.5 256.5t270.5 106.5q149 0 232.5 -81t83.5 -210q0 -168 -67.5 -286t-173.5 -118q-60 0 -97 43.5t-23 103.5q8 34 26.5 92.5
+t29.5 102t11 74.5q0 49 -26.5 81.5t-75.5 32.5q-61 0 -103.5 -56.5t-42.5 -139.5q0 -72 24 -121l-98 -414q-24 -100 -7 -254h-183q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960z" />
+    <glyph glyph-name="google_plus_sign" unicode="&#xf0d4;" 
+d="M917 631q0 26 -6 64h-362v-132h217q-3 -24 -16.5 -50t-37.5 -53t-66.5 -44.5t-96.5 -17.5q-99 0 -169 71t-70 171t70 171t169 71q92 0 153 -59l104 101q-108 100 -257 100q-160 0 -272 -112.5t-112 -271.5t112 -271.5t272 -112.5q165 0 266.5 105t101.5 270zM1262 585
+h109v110h-109v110h-110v-110h-110v-110h110v-110h110v110zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="google_plus" unicode="&#xf0d5;" horiz-adv-x="2304" 
+d="M1437 623q0 -208 -87 -370.5t-248 -254t-369 -91.5q-149 0 -285 58t-234 156t-156 234t-58 285t58 285t156 234t234 156t285 58q286 0 491 -192l-199 -191q-117 113 -292 113q-123 0 -227.5 -62t-165.5 -168.5t-61 -232.5t61 -232.5t165.5 -168.5t227.5 -62
+q83 0 152.5 23t114.5 57.5t78.5 78.5t49 83t21.5 74h-416v252h692q12 -63 12 -122zM2304 745v-210h-209v-209h-210v209h-209v210h209v209h210v-209h209z" />
+    <glyph glyph-name="money" unicode="&#xf0d6;" horiz-adv-x="1920" 
+d="M768 384h384v96h-128v448h-114l-148 -137l77 -80q42 37 55 57h2v-288h-128v-96zM1280 640q0 -70 -21 -142t-59.5 -134t-101.5 -101t-138 -39t-138 39t-101.5 101t-59.5 134t-21 142t21 142t59.5 134t101.5 101t138 39t138 -39t101.5 -101t59.5 -134t21 -142zM1792 384
+v512q-106 0 -181 75t-75 181h-1152q0 -106 -75 -181t-181 -75v-512q106 0 181 -75t75 -181h1152q0 106 75 181t181 75zM1920 1216v-1152q0 -26 -19 -45t-45 -19h-1792q-26 0 -45 19t-19 45v1152q0 26 19 45t45 19h1792q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="caret_down" unicode="&#xf0d7;" horiz-adv-x="1024" 
+d="M1024 832q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19l-448 448q-19 19 -19 45t19 45t45 19h896q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="caret_up" unicode="&#xf0d8;" horiz-adv-x="1024" 
+d="M1024 320q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45l448 448q19 19 45 19t45 -19l448 -448q19 -19 19 -45z" />
+    <glyph glyph-name="caret_left" unicode="&#xf0d9;" horiz-adv-x="640" 
+d="M640 1088v-896q0 -26 -19 -45t-45 -19t-45 19l-448 448q-19 19 -19 45t19 45l448 448q19 19 45 19t45 -19t19 -45z" />
+    <glyph glyph-name="caret_right" unicode="&#xf0da;" horiz-adv-x="640" 
+d="M576 640q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19t-19 45v896q0 26 19 45t45 19t45 -19l448 -448q19 -19 19 -45z" />
+    <glyph glyph-name="columns" unicode="&#xf0db;" horiz-adv-x="1664" 
+d="M160 0h608v1152h-640v-1120q0 -13 9.5 -22.5t22.5 -9.5zM1536 32v1120h-640v-1152h608q13 0 22.5 9.5t9.5 22.5zM1664 1248v-1216q0 -66 -47 -113t-113 -47h-1344q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1344q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="sort" unicode="&#xf0dc;" horiz-adv-x="1024" 
+d="M1024 448q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19l-448 448q-19 19 -19 45t19 45t45 19h896q26 0 45 -19t19 -45zM1024 832q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45l448 448q19 19 45 19t45 -19l448 -448q19 -19 19 -45z" />
+    <glyph glyph-name="sort_down" unicode="&#xf0dd;" horiz-adv-x="1024" 
+d="M1024 448q0 -26 -19 -45l-448 -448q-19 -19 -45 -19t-45 19l-448 448q-19 19 -19 45t19 45t45 19h896q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="sort_up" unicode="&#xf0de;" horiz-adv-x="1024" 
+d="M1024 832q0 -26 -19 -45t-45 -19h-896q-26 0 -45 19t-19 45t19 45l448 448q19 19 45 19t45 -19l448 -448q19 -19 19 -45z" />
+    <glyph glyph-name="envelope_alt" unicode="&#xf0e0;" horiz-adv-x="1792" 
+d="M1792 826v-794q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v794q44 -49 101 -87q362 -246 497 -345q57 -42 92.5 -65.5t94.5 -48t110 -24.5h1h1q51 0 110 24.5t94.5 48t92.5 65.5q170 123 498 345q57 39 100 87zM1792 1120q0 -79 -49 -151t-122 -123
+q-376 -261 -468 -325q-10 -7 -42.5 -30.5t-54 -38t-52 -32.5t-57.5 -27t-50 -9h-1h-1q-23 0 -50 9t-57.5 27t-52 32.5t-54 38t-42.5 30.5q-91 64 -262 182.5t-205 142.5q-62 42 -117 115.5t-55 136.5q0 78 41.5 130t118.5 52h1472q65 0 112.5 -47t47.5 -113z" />
+    <glyph glyph-name="linkedin" unicode="&#xf0e1;" 
+d="M349 911v-991h-330v991h330zM370 1217q1 -73 -50.5 -122t-135.5 -49h-2q-82 0 -132 49t-50 122q0 74 51.5 122.5t134.5 48.5t133 -48.5t51 -122.5zM1536 488v-568h-329v530q0 105 -40.5 164.5t-126.5 59.5q-63 0 -105.5 -34.5t-63.5 -85.5q-11 -30 -11 -81v-553h-329
+q2 399 2 647t-1 296l-1 48h329v-144h-2q20 32 41 56t56.5 52t87 43.5t114.5 15.5q171 0 275 -113.5t104 -332.5z" />
+    <glyph glyph-name="undo" unicode="&#xf0e2;" 
+d="M1536 640q0 -156 -61 -298t-164 -245t-245 -164t-298 -61q-172 0 -327 72.5t-264 204.5q-7 10 -6.5 22.5t8.5 20.5l137 138q10 9 25 9q16 -2 23 -12q73 -95 179 -147t225 -52q104 0 198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5t-40.5 198.5t-109.5 163.5
+t-163.5 109.5t-198.5 40.5q-98 0 -188 -35.5t-160 -101.5l137 -138q31 -30 14 -69q-17 -40 -59 -40h-448q-26 0 -45 19t-19 45v448q0 42 40 59q39 17 69 -14l130 -129q107 101 244.5 156.5t284.5 55.5q156 0 298 -61t245 -164t164 -245t61 -298z" />
+    <glyph glyph-name="legal" unicode="&#xf0e3;" horiz-adv-x="1792" 
+d="M1771 0q0 -53 -37 -90l-107 -108q-39 -37 -91 -37q-53 0 -90 37l-363 364q-38 36 -38 90q0 53 43 96l-256 256l-126 -126q-14 -14 -34 -14t-34 14q2 -2 12.5 -12t12.5 -13t10 -11.5t10 -13.5t6 -13.5t5.5 -16.5t1.5 -18q0 -38 -28 -68q-3 -3 -16.5 -18t-19 -20.5
+t-18.5 -16.5t-22 -15.5t-22 -9t-26 -4.5q-40 0 -68 28l-408 408q-28 28 -28 68q0 13 4.5 26t9 22t15.5 22t16.5 18.5t20.5 19t18 16.5q30 28 68 28q10 0 18 -1.5t16.5 -5.5t13.5 -6t13.5 -10t11.5 -10t13 -12.5t12 -12.5q-14 14 -14 34t14 34l348 348q14 14 34 14t34 -14
+q-2 2 -12.5 12t-12.5 13t-10 11.5t-10 13.5t-6 13.5t-5.5 16.5t-1.5 18q0 38 28 68q3 3 16.5 18t19 20.5t18.5 16.5t22 15.5t22 9t26 4.5q40 0 68 -28l408 -408q28 -28 28 -68q0 -13 -4.5 -26t-9 -22t-15.5 -22t-16.5 -18.5t-20.5 -19t-18 -16.5q-30 -28 -68 -28
+q-10 0 -18 1.5t-16.5 5.5t-13.5 6t-13.5 10t-11.5 10t-13 12.5t-12 12.5q14 -14 14 -34t-14 -34l-126 -126l256 -256q43 43 96 43q52 0 91 -37l363 -363q37 -39 37 -91z" />
+    <glyph glyph-name="dashboard" unicode="&#xf0e4;" horiz-adv-x="1792" 
+d="M384 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM576 832q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1004 351l101 382q6 26 -7.5 48.5t-38.5 29.5
+t-48 -6.5t-30 -39.5l-101 -382q-60 -5 -107 -43.5t-63 -98.5q-20 -77 20 -146t117 -89t146 20t89 117q16 60 -6 117t-72 91zM1664 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1024 1024q0 53 -37.5 90.5
+t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1472 832q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1792 384q0 -261 -141 -483q-19 -29 -54 -29h-1402q-35 0 -54 29
+q-141 221 -141 483q0 182 71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="comment_alt" unicode="&#xf0e5;" horiz-adv-x="1792" 
+d="M896 1152q-204 0 -381.5 -69.5t-282 -187.5t-104.5 -255q0 -112 71.5 -213.5t201.5 -175.5l87 -50l-27 -96q-24 -91 -70 -172q152 63 275 171l43 38l57 -6q69 -8 130 -8q204 0 381.5 69.5t282 187.5t104.5 255t-104.5 255t-282 187.5t-381.5 69.5zM1792 640
+q0 -174 -120 -321.5t-326 -233t-450 -85.5q-70 0 -145 8q-198 -175 -460 -242q-49 -14 -114 -22h-5q-15 0 -27 10.5t-16 27.5v1q-3 4 -0.5 12t2 10t4.5 9.5l6 9t7 8.5t8 9q7 8 31 34.5t34.5 38t31 39.5t32.5 51t27 59t26 76q-157 89 -247.5 220t-90.5 281q0 174 120 321.5
+t326 233t450 85.5t450 -85.5t326 -233t120 -321.5z" />
+    <glyph glyph-name="comments_alt" unicode="&#xf0e6;" horiz-adv-x="1792" 
+d="M704 1152q-153 0 -286 -52t-211.5 -141t-78.5 -191q0 -82 53 -158t149 -132l97 -56l-35 -84q34 20 62 39l44 31l53 -10q78 -14 153 -14q153 0 286 52t211.5 141t78.5 191t-78.5 191t-211.5 141t-286 52zM704 1280q191 0 353.5 -68.5t256.5 -186.5t94 -257t-94 -257
+t-256.5 -186.5t-353.5 -68.5q-86 0 -176 16q-124 -88 -278 -128q-36 -9 -86 -16h-3q-11 0 -20.5 8t-11.5 21q-1 3 -1 6.5t0.5 6.5t2 6l2.5 5t3.5 5.5t4 5t4.5 5t4 4.5q5 6 23 25t26 29.5t22.5 29t25 38.5t20.5 44q-124 72 -195 177t-71 224q0 139 94 257t256.5 186.5
+t353.5 68.5zM1526 111q10 -24 20.5 -44t25 -38.5t22.5 -29t26 -29.5t23 -25q1 -1 4 -4.5t4.5 -5t4 -5t3.5 -5.5l2.5 -5t2 -6t0.5 -6.5t-1 -6.5q-3 -14 -13 -22t-22 -7q-50 7 -86 16q-154 40 -278 128q-90 -16 -176 -16q-271 0 -472 132q58 -4 88 -4q161 0 309 45t264 129
+q125 92 192 212t67 254q0 77 -23 152q129 -71 204 -178t75 -230q0 -120 -71 -224.5t-195 -176.5z" />
+    <glyph glyph-name="bolt" unicode="&#xf0e7;" horiz-adv-x="896" 
+d="M885 970q18 -20 7 -44l-540 -1157q-13 -25 -42 -25q-4 0 -14 2q-17 5 -25.5 19t-4.5 30l197 808l-406 -101q-4 -1 -12 -1q-18 0 -31 11q-18 15 -13 39l201 825q4 14 16 23t28 9h328q19 0 32 -12.5t13 -29.5q0 -8 -5 -18l-171 -463l396 98q8 2 12 2q19 0 34 -15z" />
+    <glyph glyph-name="sitemap" unicode="&#xf0e8;" horiz-adv-x="1792" 
+d="M1792 288v-320q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h96v192h-512v-192h96q40 0 68 -28t28 -68v-320q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h96v192h-512v-192h96q40 0 68 -28t28 -68v-320
+q0 -40 -28 -68t-68 -28h-320q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h96v192q0 52 38 90t90 38h512v192h-96q-40 0 -68 28t-28 68v320q0 40 28 68t68 28h320q40 0 68 -28t28 -68v-320q0 -40 -28 -68t-68 -28h-96v-192h512q52 0 90 -38t38 -90v-192h96q40 0 68 -28t28 -68
+z" />
+    <glyph glyph-name="umbrella" unicode="&#xf0e9;" horiz-adv-x="1664" 
+d="M896 708v-580q0 -104 -76 -180t-180 -76t-180 76t-76 180q0 26 19 45t45 19t45 -19t19 -45q0 -50 39 -89t89 -39t89 39t39 89v580q33 11 64 11t64 -11zM1664 681q0 -13 -9.5 -22.5t-22.5 -9.5q-11 0 -23 10q-49 46 -93 69t-102 23q-68 0 -128 -37t-103 -97
+q-7 -10 -17.5 -28t-14.5 -24q-11 -17 -28 -17q-18 0 -29 17q-4 6 -14.5 24t-17.5 28q-43 60 -102.5 97t-127.5 37t-127.5 -37t-102.5 -97q-7 -10 -17.5 -28t-14.5 -24q-11 -17 -29 -17q-17 0 -28 17q-4 6 -14.5 24t-17.5 28q-43 60 -103 97t-128 37q-58 0 -102 -23t-93 -69
+q-12 -10 -23 -10q-13 0 -22.5 9.5t-9.5 22.5q0 5 1 7q45 183 172.5 319.5t298 204.5t360.5 68q140 0 274.5 -40t246.5 -113.5t194.5 -187t115.5 -251.5q1 -2 1 -7zM896 1408v-98q-42 2 -64 2t-64 -2v98q0 26 19 45t45 19t45 -19t19 -45z" />
+    <glyph glyph-name="paste" unicode="&#xf0ea;" horiz-adv-x="1792" 
+d="M768 -128h896v640h-416q-40 0 -68 28t-28 68v416h-384v-1152zM1024 1312v64q0 13 -9.5 22.5t-22.5 9.5h-704q-13 0 -22.5 -9.5t-9.5 -22.5v-64q0 -13 9.5 -22.5t22.5 -9.5h704q13 0 22.5 9.5t9.5 22.5zM1280 640h299l-299 299v-299zM1792 512v-672q0 -40 -28 -68t-68 -28
+h-960q-40 0 -68 28t-28 68v160h-544q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h1088q40 0 68 -28t28 -68v-328q21 -13 36 -28l408 -408q28 -28 48 -76t20 -88z" />
+    <glyph glyph-name="light_bulb" unicode="&#xf0eb;" horiz-adv-x="1024" 
+d="M736 960q0 -13 -9.5 -22.5t-22.5 -9.5t-22.5 9.5t-9.5 22.5q0 46 -54 71t-106 25q-13 0 -22.5 9.5t-9.5 22.5t9.5 22.5t22.5 9.5q50 0 99.5 -16t87 -54t37.5 -90zM896 960q0 72 -34.5 134t-90 101.5t-123 62t-136.5 22.5t-136.5 -22.5t-123 -62t-90 -101.5t-34.5 -134
+q0 -101 68 -180q10 -11 30.5 -33t30.5 -33q128 -153 141 -298h228q13 145 141 298q10 11 30.5 33t30.5 33q68 79 68 180zM1024 960q0 -155 -103 -268q-45 -49 -74.5 -87t-59.5 -95.5t-34 -107.5q47 -28 47 -82q0 -37 -25 -64q25 -27 25 -64q0 -52 -45 -81q13 -23 13 -47
+q0 -46 -31.5 -71t-77.5 -25q-20 -44 -60 -70t-87 -26t-87 26t-60 70q-46 0 -77.5 25t-31.5 71q0 24 13 47q-45 29 -45 81q0 37 25 64q-25 27 -25 64q0 54 47 82q-4 50 -34 107.5t-59.5 95.5t-74.5 87q-103 113 -103 268q0 99 44.5 184.5t117 142t164 89t186.5 32.5
+t186.5 -32.5t164 -89t117 -142t44.5 -184.5z" />
+    <glyph glyph-name="exchange" unicode="&#xf0ec;" horiz-adv-x="1792" 
+d="M1792 352v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-1376v-192q0 -13 -9.5 -22.5t-22.5 -9.5q-12 0 -24 10l-319 320q-9 9 -9 22q0 14 9 23l320 320q9 9 23 9q13 0 22.5 -9.5t9.5 -22.5v-192h1376q13 0 22.5 -9.5t9.5 -22.5zM1792 896q0 -14 -9 -23l-320 -320q-9 -9 -23 -9
+q-13 0 -22.5 9.5t-9.5 22.5v192h-1376q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h1376v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23z" />
+    <glyph glyph-name="cloud_download" unicode="&#xf0ed;" horiz-adv-x="1920" 
+d="M1280 608q0 14 -9 23t-23 9h-224v352q0 13 -9.5 22.5t-22.5 9.5h-192q-13 0 -22.5 -9.5t-9.5 -22.5v-352h-224q-13 0 -22.5 -9.5t-9.5 -22.5q0 -14 9 -23l352 -352q9 -9 23 -9t23 9l351 351q10 12 10 24zM1920 384q0 -159 -112.5 -271.5t-271.5 -112.5h-1088
+q-185 0 -316.5 131.5t-131.5 316.5q0 130 70 240t188 165q-2 30 -2 43q0 212 150 362t362 150q156 0 285.5 -87t188.5 -231q71 62 166 62q106 0 181 -75t75 -181q0 -76 -41 -138q130 -31 213.5 -135.5t83.5 -238.5z" />
+    <glyph glyph-name="cloud_upload" unicode="&#xf0ee;" horiz-adv-x="1920" 
+d="M1280 672q0 14 -9 23l-352 352q-9 9 -23 9t-23 -9l-351 -351q-10 -12 -10 -24q0 -14 9 -23t23 -9h224v-352q0 -13 9.5 -22.5t22.5 -9.5h192q13 0 22.5 9.5t9.5 22.5v352h224q13 0 22.5 9.5t9.5 22.5zM1920 384q0 -159 -112.5 -271.5t-271.5 -112.5h-1088
+q-185 0 -316.5 131.5t-131.5 316.5q0 130 70 240t188 165q-2 30 -2 43q0 212 150 362t362 150q156 0 285.5 -87t188.5 -231q71 62 166 62q106 0 181 -75t75 -181q0 -76 -41 -138q130 -31 213.5 -135.5t83.5 -238.5z" />
+    <glyph glyph-name="user_md" unicode="&#xf0f0;" horiz-adv-x="1408" 
+d="M384 192q0 -26 -19 -45t-45 -19t-45 19t-19 45t19 45t45 19t45 -19t19 -45zM1408 131q0 -121 -73 -190t-194 -69h-874q-121 0 -194 69t-73 190q0 68 5.5 131t24 138t47.5 132.5t81 103t120 60.5q-22 -52 -22 -120v-203q-58 -20 -93 -70t-35 -111q0 -80 56 -136t136 -56
+t136 56t56 136q0 61 -35.5 111t-92.5 70v203q0 62 25 93q132 -104 295 -104t295 104q25 -31 25 -93v-64q-106 0 -181 -75t-75 -181v-89q-32 -29 -32 -71q0 -40 28 -68t68 -28t68 28t28 68q0 42 -32 71v89q0 52 38 90t90 38t90 -38t38 -90v-89q-32 -29 -32 -71q0 -40 28 -68
+t68 -28t68 28t28 68q0 42 -32 71v89q0 68 -34.5 127.5t-93.5 93.5q0 10 0.5 42.5t0 48t-2.5 41.5t-7 47t-13 40q68 -15 120 -60.5t81 -103t47.5 -132.5t24 -138t5.5 -131zM1088 1024q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5
+t271.5 -112.5t112.5 -271.5z" />
+    <glyph glyph-name="stethoscope" unicode="&#xf0f1;" horiz-adv-x="1408" 
+d="M1280 832q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 832q0 -62 -35.5 -111t-92.5 -70v-395q0 -159 -131.5 -271.5t-316.5 -112.5t-316.5 112.5t-131.5 271.5v132q-164 20 -274 128t-110 252v512q0 26 19 45t45 19q6 0 16 -2q17 30 47 48
+t65 18q53 0 90.5 -37.5t37.5 -90.5t-37.5 -90.5t-90.5 -37.5q-33 0 -64 18v-402q0 -106 94 -181t226 -75t226 75t94 181v402q-31 -18 -64 -18q-53 0 -90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5q35 0 65 -18t47 -48q10 2 16 2q26 0 45 -19t19 -45v-512q0 -144 -110 -252
+t-274 -128v-132q0 -106 94 -181t226 -75t226 75t94 181v395q-57 21 -92.5 70t-35.5 111q0 80 56 136t136 56t136 -56t56 -136z" />
+    <glyph glyph-name="suitcase" unicode="&#xf0f2;" horiz-adv-x="1792" 
+d="M640 1152h512v128h-512v-128zM288 1152v-1280h-64q-92 0 -158 66t-66 158v832q0 92 66 158t158 66h64zM1408 1152v-1280h-1024v1280h128v160q0 40 28 68t68 28h576q40 0 68 -28t28 -68v-160h128zM1792 928v-832q0 -92 -66 -158t-158 -66h-64v1280h64q92 0 158 -66
+t66 -158z" />
+    <glyph glyph-name="bell_alt" unicode="&#xf0f3;" horiz-adv-x="1792" 
+d="M912 -160q0 16 -16 16q-59 0 -101.5 42.5t-42.5 101.5q0 16 -16 16t-16 -16q0 -73 51.5 -124.5t124.5 -51.5q16 0 16 16zM1728 128q0 -52 -38 -90t-90 -38h-448q0 -106 -75 -181t-181 -75t-181 75t-75 181h-448q-52 0 -90 38t-38 90q50 42 91 88t85 119.5t74.5 158.5
+t50 206t19.5 260q0 152 117 282.5t307 158.5q-8 19 -8 39q0 40 28 68t68 28t68 -28t28 -68q0 -20 -8 -39q190 -28 307 -158.5t117 -282.5q0 -139 19.5 -260t50 -206t74.5 -158.5t85 -119.5t91 -88z" />
+    <glyph glyph-name="coffee" unicode="&#xf0f4;" horiz-adv-x="1920" 
+d="M1664 896q0 80 -56 136t-136 56h-64v-384h64q80 0 136 56t56 136zM0 128h1792q0 -106 -75 -181t-181 -75h-1280q-106 0 -181 75t-75 181zM1856 896q0 -159 -112.5 -271.5t-271.5 -112.5h-64v-32q0 -92 -66 -158t-158 -66h-704q-92 0 -158 66t-66 158v736q0 26 19 45
+t45 19h1152q159 0 271.5 -112.5t112.5 -271.5z" />
+    <glyph glyph-name="food" unicode="&#xf0f5;" horiz-adv-x="1408" 
+d="M640 1472v-640q0 -61 -35.5 -111t-92.5 -70v-779q0 -52 -38 -90t-90 -38h-128q-52 0 -90 38t-38 90v779q-57 20 -92.5 70t-35.5 111v640q0 26 19 45t45 19t45 -19t19 -45v-416q0 -26 19 -45t45 -19t45 19t19 45v416q0 26 19 45t45 19t45 -19t19 -45v-416q0 -26 19 -45
+t45 -19t45 19t19 45v416q0 26 19 45t45 19t45 -19t19 -45zM1408 1472v-1600q0 -52 -38 -90t-90 -38h-128q-52 0 -90 38t-38 90v512h-224q-13 0 -22.5 9.5t-9.5 22.5v800q0 132 94 226t226 94h256q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="file_text_alt" unicode="&#xf0f6;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M384 736q0 14 9 23t23 9h704q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-704q-14 0 -23 9t-9 23v64zM1120 512q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-704q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h704zM1120 256q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-704
+q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h704z" />
+    <glyph glyph-name="building" unicode="&#xf0f7;" horiz-adv-x="1408" 
+d="M384 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M640 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M1152 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M640 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M1152 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M640 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M1152 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M640 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 992v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M896 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 1248v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M896 -128h384v1536h-1152v-1536h384v224q0 13 9.5 22.5t22.5 9.5h320q13 0 22.5 -9.5t9.5 -22.5v-224zM1408 1472v-1664q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v1664q0 26 19 45t45 19h1280q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="hospital" unicode="&#xf0f8;" horiz-adv-x="1408" 
+d="M384 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M640 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM384 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M1152 224v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM896 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M640 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 480v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M896 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5zM1152 736v-64q0 -13 -9.5 -22.5t-22.5 -9.5h-64q-13 0 -22.5 9.5t-9.5 22.5v64q0 13 9.5 22.5t22.5 9.5h64q13 0 22.5 -9.5t9.5 -22.5z
+M896 -128h384v1152h-256v-32q0 -40 -28 -68t-68 -28h-448q-40 0 -68 28t-28 68v32h-256v-1152h384v224q0 13 9.5 22.5t22.5 9.5h320q13 0 22.5 -9.5t9.5 -22.5v-224zM896 1056v320q0 13 -9.5 22.5t-22.5 9.5h-64q-13 0 -22.5 -9.5t-9.5 -22.5v-96h-128v96q0 13 -9.5 22.5
+t-22.5 9.5h-64q-13 0 -22.5 -9.5t-9.5 -22.5v-320q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5v96h128v-96q0 -13 9.5 -22.5t22.5 -9.5h64q13 0 22.5 9.5t9.5 22.5zM1408 1088v-1280q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v1280q0 26 19 45t45 19h320
+v288q0 40 28 68t68 28h448q40 0 68 -28t28 -68v-288h320q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="ambulance" unicode="&#xf0f9;" horiz-adv-x="1920" 
+d="M640 128q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM256 640h384v256h-158q-14 -2 -22 -9l-195 -195q-7 -12 -9 -22v-30zM1536 128q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5
+t90.5 37.5t37.5 90.5zM1664 800v192q0 14 -9 23t-23 9h-224v224q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-224h-224q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h224v-224q0 -14 9 -23t23 -9h192q14 0 23 9t9 23v224h224q14 0 23 9t9 23zM1920 1344v-1152
+q0 -26 -19 -45t-45 -19h-192q0 -106 -75 -181t-181 -75t-181 75t-75 181h-384q0 -106 -75 -181t-181 -75t-181 75t-75 181h-128q-26 0 -45 19t-19 45t19 45t45 19v416q0 26 13 58t32 51l198 198q19 19 51 32t58 13h160v320q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="medkit" unicode="&#xf0fa;" horiz-adv-x="1792" 
+d="M1280 416v192q0 14 -9 23t-23 9h-224v224q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23v-224h-224q-14 0 -23 -9t-9 -23v-192q0 -14 9 -23t23 -9h224v-224q0 -14 9 -23t23 -9h192q14 0 23 9t9 23v224h224q14 0 23 9t9 23zM640 1152h512v128h-512v-128zM256 1152v-1280h-32
+q-92 0 -158 66t-66 158v832q0 92 66 158t158 66h32zM1440 1152v-1280h-1088v1280h160v160q0 40 28 68t68 28h576q40 0 68 -28t28 -68v-160h160zM1792 928v-832q0 -92 -66 -158t-158 -66h-32v1280h32q92 0 158 -66t66 -158z" />
+    <glyph glyph-name="fighter_jet" unicode="&#xf0fb;" horiz-adv-x="1920" 
+d="M1920 576q-1 -32 -288 -96l-352 -32l-224 -64h-64l-293 -352h69q26 0 45 -4.5t19 -11.5t-19 -11.5t-45 -4.5h-96h-160h-64v32h64v416h-160l-192 -224h-96l-32 32v192h32v32h128v8l-192 24v128l192 24v8h-128v32h-32v192l32 32h96l192 -224h160v416h-64v32h64h160h96
+q26 0 45 -4.5t19 -11.5t-19 -11.5t-45 -4.5h-69l293 -352h64l224 -64l352 -32q128 -28 200 -52t80 -34z" />
+    <glyph glyph-name="beer" unicode="&#xf0fc;" horiz-adv-x="1664" 
+d="M640 640v384h-256v-256q0 -53 37.5 -90.5t90.5 -37.5h128zM1664 192v-192h-1152v192l128 192h-128q-159 0 -271.5 112.5t-112.5 271.5v320l-64 64l32 128h480l32 128h960l32 -192l-64 -32v-800z" />
+    <glyph glyph-name="h_sign" unicode="&#xf0fd;" 
+d="M1280 192v896q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-320h-512v320q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-896q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v320h512v-320q0 -26 19 -45t45 -19h128q26 0 45 19t19 45zM1536 1120v-960
+q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="f0fe" unicode="&#xf0fe;" 
+d="M1280 576v128q0 26 -19 45t-45 19h-320v320q0 26 -19 45t-45 19h-128q-26 0 -45 -19t-19 -45v-320h-320q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h320v-320q0 -26 19 -45t45 -19h128q26 0 45 19t19 45v320h320q26 0 45 19t19 45zM1536 1120v-960
+q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="double_angle_left" unicode="&#xf100;" horiz-adv-x="1024" 
+d="M627 160q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23t-10 -23l-393 -393l393 -393q10 -10 10 -23zM1011 160q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23
+t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23t-10 -23l-393 -393l393 -393q10 -10 10 -23z" />
+    <glyph glyph-name="double_angle_right" unicode="&#xf101;" horiz-adv-x="1024" 
+d="M595 576q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23zM979 576q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23
+l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" />
+    <glyph glyph-name="double_angle_up" unicode="&#xf102;" horiz-adv-x="1152" 
+d="M1075 224q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-393 393l-393 -393q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l466 -466q10 -10 10 -23zM1075 608q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-393 393l-393 -393
+q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" />
+    <glyph glyph-name="double_angle_down" unicode="&#xf103;" horiz-adv-x="1152" 
+d="M1075 672q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l393 -393l393 393q10 10 23 10t23 -10l50 -50q10 -10 10 -23zM1075 1056q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23
+t10 23l50 50q10 10 23 10t23 -10l393 -393l393 393q10 10 23 10t23 -10l50 -50q10 -10 10 -23z" />
+    <glyph glyph-name="angle_left" unicode="&#xf104;" horiz-adv-x="640" 
+d="M627 992q0 -13 -10 -23l-393 -393l393 -393q10 -10 10 -23t-10 -23l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23z" />
+    <glyph glyph-name="angle_right" unicode="&#xf105;" horiz-adv-x="640" 
+d="M595 576q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" />
+    <glyph glyph-name="angle_up" unicode="&#xf106;" horiz-adv-x="1152" 
+d="M1075 352q0 -13 -10 -23l-50 -50q-10 -10 -23 -10t-23 10l-393 393l-393 -393q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l466 -466q10 -10 10 -23z" />
+    <glyph glyph-name="angle_down" unicode="&#xf107;" horiz-adv-x="1152" 
+d="M1075 800q0 -13 -10 -23l-466 -466q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l393 -393l393 393q10 10 23 10t23 -10l50 -50q10 -10 10 -23z" />
+    <glyph glyph-name="desktop" unicode="&#xf108;" horiz-adv-x="1920" 
+d="M1792 544v832q0 13 -9.5 22.5t-22.5 9.5h-1600q-13 0 -22.5 -9.5t-9.5 -22.5v-832q0 -13 9.5 -22.5t22.5 -9.5h1600q13 0 22.5 9.5t9.5 22.5zM1920 1376v-1088q0 -66 -47 -113t-113 -47h-544q0 -37 16 -77.5t32 -71t16 -43.5q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19
+t-19 45q0 14 16 44t32 70t16 78h-544q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="laptop" unicode="&#xf109;" horiz-adv-x="1920" 
+d="M416 256q-66 0 -113 47t-47 113v704q0 66 47 113t113 47h1088q66 0 113 -47t47 -113v-704q0 -66 -47 -113t-113 -47h-1088zM384 1120v-704q0 -13 9.5 -22.5t22.5 -9.5h1088q13 0 22.5 9.5t9.5 22.5v704q0 13 -9.5 22.5t-22.5 9.5h-1088q-13 0 -22.5 -9.5t-9.5 -22.5z
+M1760 192h160v-96q0 -40 -47 -68t-113 -28h-1600q-66 0 -113 28t-47 68v96h160h1600zM1040 96q16 0 16 16t-16 16h-160q-16 0 -16 -16t16 -16h160z" />
+    <glyph glyph-name="tablet" unicode="&#xf10a;" horiz-adv-x="1152" 
+d="M640 128q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1024 288v960q0 13 -9.5 22.5t-22.5 9.5h-832q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h832q13 0 22.5 9.5t9.5 22.5zM1152 1248v-1088q0 -66 -47 -113t-113 -47h-832
+q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h832q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="mobile_phone" unicode="&#xf10b;" horiz-adv-x="768" 
+d="M464 128q0 33 -23.5 56.5t-56.5 23.5t-56.5 -23.5t-23.5 -56.5t23.5 -56.5t56.5 -23.5t56.5 23.5t23.5 56.5zM672 288v704q0 13 -9.5 22.5t-22.5 9.5h-512q-13 0 -22.5 -9.5t-9.5 -22.5v-704q0 -13 9.5 -22.5t22.5 -9.5h512q13 0 22.5 9.5t9.5 22.5zM480 1136
+q0 16 -16 16h-160q-16 0 -16 -16t16 -16h160q16 0 16 16zM768 1152v-1024q0 -52 -38 -90t-90 -38h-512q-52 0 -90 38t-38 90v1024q0 52 38 90t90 38h512q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="circle_blank" unicode="&#xf10c;" 
+d="M768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103
+t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="quote_left" unicode="&#xf10d;" horiz-adv-x="1664" 
+d="M768 576v-384q0 -80 -56 -136t-136 -56h-384q-80 0 -136 56t-56 136v704q0 104 40.5 198.5t109.5 163.5t163.5 109.5t198.5 40.5h64q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-64q-106 0 -181 -75t-75 -181v-32q0 -40 28 -68t68 -28h224q80 0 136 -56t56 -136z
+M1664 576v-384q0 -80 -56 -136t-136 -56h-384q-80 0 -136 56t-56 136v704q0 104 40.5 198.5t109.5 163.5t163.5 109.5t198.5 40.5h64q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-64q-106 0 -181 -75t-75 -181v-32q0 -40 28 -68t68 -28h224q80 0 136 -56t56 -136z" />
+    <glyph glyph-name="quote_right" unicode="&#xf10e;" horiz-adv-x="1664" 
+d="M768 1216v-704q0 -104 -40.5 -198.5t-109.5 -163.5t-163.5 -109.5t-198.5 -40.5h-64q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h64q106 0 181 75t75 181v32q0 40 -28 68t-68 28h-224q-80 0 -136 56t-56 136v384q0 80 56 136t136 56h384q80 0 136 -56t56 -136zM1664 1216
+v-704q0 -104 -40.5 -198.5t-109.5 -163.5t-163.5 -109.5t-198.5 -40.5h-64q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h64q106 0 181 75t75 181v32q0 40 -28 68t-68 28h-224q-80 0 -136 56t-56 136v384q0 80 56 136t136 56h384q80 0 136 -56t56 -136z" />
+    <glyph glyph-name="spinner" unicode="&#xf110;" horiz-adv-x="1792" 
+d="M526 142q0 -53 -37.5 -90.5t-90.5 -37.5q-52 0 -90 38t-38 90q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1024 -64q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM320 640q0 -53 -37.5 -90.5t-90.5 -37.5
+t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1522 142q0 -52 -38 -90t-90 -38q-53 0 -90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM558 1138q0 -66 -47 -113t-113 -47t-113 47t-47 113t47 113t113 47t113 -47t47 -113z
+M1728 640q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1088 1344q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1618 1138q0 -93 -66 -158.5t-158 -65.5q-93 0 -158.5 65.5t-65.5 158.5
+q0 92 65.5 158t158.5 66q92 0 158 -66t66 -158z" />
+    <glyph glyph-name="circle" unicode="&#xf111;" 
+d="M1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="reply" unicode="&#xf112;" horiz-adv-x="1792" 
+d="M1792 416q0 -166 -127 -451q-3 -7 -10.5 -24t-13.5 -30t-13 -22q-12 -17 -28 -17q-15 0 -23.5 10t-8.5 25q0 9 2.5 26.5t2.5 23.5q5 68 5 123q0 101 -17.5 181t-48.5 138.5t-80 101t-105.5 69.5t-133 42.5t-154 21.5t-175.5 6h-224v-256q0 -26 -19 -45t-45 -19t-45 19
+l-512 512q-19 19 -19 45t19 45l512 512q19 19 45 19t45 -19t19 -45v-256h224q713 0 875 -403q53 -134 53 -333z" />
+    <glyph glyph-name="github_alt" unicode="&#xf113;" horiz-adv-x="1664" 
+d="M640 320q0 -40 -12.5 -82t-43 -76t-72.5 -34t-72.5 34t-43 76t-12.5 82t12.5 82t43 76t72.5 34t72.5 -34t43 -76t12.5 -82zM1280 320q0 -40 -12.5 -82t-43 -76t-72.5 -34t-72.5 34t-43 76t-12.5 82t12.5 82t43 76t72.5 34t72.5 -34t43 -76t12.5 -82zM1440 320
+q0 120 -69 204t-187 84q-41 0 -195 -21q-71 -11 -157 -11t-157 11q-152 21 -195 21q-118 0 -187 -84t-69 -204q0 -88 32 -153.5t81 -103t122 -60t140 -29.5t149 -7h168q82 0 149 7t140 29.5t122 60t81 103t32 153.5zM1664 496q0 -207 -61 -331q-38 -77 -105.5 -133t-141 -86
+t-170 -47.5t-171.5 -22t-167 -4.5q-78 0 -142 3t-147.5 12.5t-152.5 30t-137 51.5t-121 81t-86 115q-62 123 -62 331q0 237 136 396q-27 82 -27 170q0 116 51 218q108 0 190 -39.5t189 -123.5q147 35 309 35q148 0 280 -32q105 82 187 121t189 39q51 -102 51 -218
+q0 -87 -27 -168q136 -160 136 -398z" />
+    <glyph glyph-name="folder_close_alt" unicode="&#xf114;" horiz-adv-x="1664" 
+d="M1536 224v704q0 40 -28 68t-68 28h-704q-40 0 -68 28t-28 68v64q0 40 -28 68t-68 28h-320q-40 0 -68 -28t-28 -68v-960q0 -40 28 -68t68 -28h1216q40 0 68 28t28 68zM1664 928v-704q0 -92 -66 -158t-158 -66h-1216q-92 0 -158 66t-66 158v960q0 92 66 158t158 66h320
+q92 0 158 -66t66 -158v-32h672q92 0 158 -66t66 -158z" />
+    <glyph glyph-name="folder_open_alt" unicode="&#xf115;" horiz-adv-x="1920" 
+d="M1781 605q0 35 -53 35h-1088q-40 0 -85.5 -21.5t-71.5 -52.5l-294 -363q-18 -24 -18 -40q0 -35 53 -35h1088q40 0 86 22t71 53l294 363q18 22 18 39zM640 768h768v160q0 40 -28 68t-68 28h-576q-40 0 -68 28t-28 68v64q0 40 -28 68t-68 28h-320q-40 0 -68 -28t-28 -68
+v-853l256 315q44 53 116 87.5t140 34.5zM1909 605q0 -62 -46 -120l-295 -363q-43 -53 -116 -87.5t-140 -34.5h-1088q-92 0 -158 66t-66 158v960q0 92 66 158t158 66h320q92 0 158 -66t66 -158v-32h544q92 0 158 -66t66 -158v-160h192q54 0 99 -24.5t67 -70.5q15 -32 15 -68z
+" />
+    <glyph glyph-name="expand_alt" unicode="&#xf116;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="collapse_alt" unicode="&#xf117;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="smile" unicode="&#xf118;" 
+d="M1134 461q-37 -121 -138 -195t-228 -74t-228 74t-138 195q-8 25 4 48.5t38 31.5q25 8 48.5 -4t31.5 -38q25 -80 92.5 -129.5t151.5 -49.5t151.5 49.5t92.5 129.5q8 26 32 38t49 4t37 -31.5t4 -48.5zM640 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5
+t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1152 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5
+t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="frown" unicode="&#xf119;" 
+d="M1134 307q8 -25 -4 -48.5t-37 -31.5t-49 4t-32 38q-25 80 -92.5 129.5t-151.5 49.5t-151.5 -49.5t-92.5 -129.5q-8 -26 -31.5 -38t-48.5 -4q-26 8 -38 31.5t-4 48.5q37 121 138 195t228 74t228 -74t138 -195zM640 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5
+t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1152 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204
+t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="meh" unicode="&#xf11a;" 
+d="M1152 448q0 -26 -19 -45t-45 -19h-640q-26 0 -45 19t-19 45t19 45t45 19h640q26 0 45 -19t19 -45zM640 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1152 896q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5
+t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="gamepad" unicode="&#xf11b;" horiz-adv-x="1920" 
+d="M832 448v128q0 14 -9 23t-23 9h-192v192q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-192h-192q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h192v-192q0 -14 9 -23t23 -9h128q14 0 23 9t9 23v192h192q14 0 23 9t9 23zM1408 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5
+t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1664 640q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1920 512q0 -212 -150 -362t-362 -150q-192 0 -338 128h-220q-146 -128 -338 -128q-212 0 -362 150
+t-150 362t150 362t362 150h896q212 0 362 -150t150 -362z" />
+    <glyph glyph-name="keyboard" unicode="&#xf11c;" horiz-adv-x="1920" 
+d="M384 368v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM512 624v-96q0 -16 -16 -16h-224q-16 0 -16 16v96q0 16 16 16h224q16 0 16 -16zM384 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1408 368v-96q0 -16 -16 -16
+h-864q-16 0 -16 16v96q0 16 16 16h864q16 0 16 -16zM768 624v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM640 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1024 624v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16
+h96q16 0 16 -16zM896 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1280 624v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1664 368v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1152 880v-96
+q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1408 880v-96q0 -16 -16 -16h-96q-16 0 -16 16v96q0 16 16 16h96q16 0 16 -16zM1664 880v-352q0 -16 -16 -16h-224q-16 0 -16 16v96q0 16 16 16h112v240q0 16 16 16h96q16 0 16 -16zM1792 128v896h-1664v-896
+h1664zM1920 1024v-896q0 -53 -37.5 -90.5t-90.5 -37.5h-1664q-53 0 -90.5 37.5t-37.5 90.5v896q0 53 37.5 90.5t90.5 37.5h1664q53 0 90.5 -37.5t37.5 -90.5z" />
+    <glyph glyph-name="flag_alt" unicode="&#xf11d;" horiz-adv-x="1792" 
+d="M1664 491v616q-169 -91 -306 -91q-82 0 -145 32q-100 49 -184 76.5t-178 27.5q-173 0 -403 -127v-599q245 113 433 113q55 0 103.5 -7.5t98 -26t77 -31t82.5 -39.5l28 -14q44 -22 101 -22q120 0 293 92zM320 1280q0 -35 -17.5 -64t-46.5 -46v-1266q0 -14 -9 -23t-23 -9
+h-64q-14 0 -23 9t-9 23v1266q-29 17 -46.5 46t-17.5 64q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1792 1216v-763q0 -39 -35 -57q-10 -5 -17 -9q-218 -116 -369 -116q-88 0 -158 35l-28 14q-64 33 -99 48t-91 29t-114 14q-102 0 -235.5 -44t-228.5 -102
+q-15 -9 -33 -9q-16 0 -32 8q-32 19 -32 56v742q0 35 31 55q35 21 78.5 42.5t114 52t152.5 49.5t155 19q112 0 209 -31t209 -86q38 -19 89 -19q122 0 310 112q22 12 31 17q31 16 62 -2q31 -20 31 -55z" />
+    <glyph glyph-name="flag_checkered" unicode="&#xf11e;" horiz-adv-x="1792" 
+d="M832 536v192q-181 -16 -384 -117v-185q205 96 384 110zM832 954v197q-172 -8 -384 -126v-189q215 111 384 118zM1664 491v184q-235 -116 -384 -71v224q-20 6 -39 15q-5 3 -33 17t-34.5 17t-31.5 15t-34.5 15.5t-32.5 13t-36 12.5t-35 8.5t-39.5 7.5t-39.5 4t-44 2
+q-23 0 -49 -3v-222h19q102 0 192.5 -29t197.5 -82q19 -9 39 -15v-188q42 -17 91 -17q120 0 293 92zM1664 918v189q-169 -91 -306 -91q-45 0 -78 8v-196q148 -42 384 90zM320 1280q0 -35 -17.5 -64t-46.5 -46v-1266q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v1266
+q-29 17 -46.5 46t-17.5 64q0 53 37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1792 1216v-763q0 -39 -35 -57q-10 -5 -17 -9q-218 -116 -369 -116q-88 0 -158 35l-28 14q-64 33 -99 48t-91 29t-114 14q-102 0 -235.5 -44t-228.5 -102q-15 -9 -33 -9q-16 0 -32 8
+q-32 19 -32 56v742q0 35 31 55q35 21 78.5 42.5t114 52t152.5 49.5t155 19q112 0 209 -31t209 -86q38 -19 89 -19q122 0 310 112q22 12 31 17q31 16 62 -2q31 -20 31 -55z" />
+    <glyph glyph-name="terminal" unicode="&#xf120;" horiz-adv-x="1664" 
+d="M585 553l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23t-10 -23zM1664 96v-64q0 -14 -9 -23t-23 -9h-960q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h960q14 0 23 -9
+t9 -23z" />
+    <glyph glyph-name="code" unicode="&#xf121;" horiz-adv-x="1920" 
+d="M617 137l-50 -50q-10 -10 -23 -10t-23 10l-466 466q-10 10 -10 23t10 23l466 466q10 10 23 10t23 -10l50 -50q10 -10 10 -23t-10 -23l-393 -393l393 -393q10 -10 10 -23t-10 -23zM1208 1204l-373 -1291q-4 -13 -15.5 -19.5t-23.5 -2.5l-62 17q-13 4 -19.5 15.5t-2.5 24.5
+l373 1291q4 13 15.5 19.5t23.5 2.5l62 -17q13 -4 19.5 -15.5t2.5 -24.5zM1865 553l-466 -466q-10 -10 -23 -10t-23 10l-50 50q-10 10 -10 23t10 23l393 393l-393 393q-10 10 -10 23t10 23l50 50q10 10 23 10t23 -10l466 -466q10 -10 10 -23t-10 -23z" />
+    <glyph glyph-name="reply_all" unicode="&#xf122;" horiz-adv-x="1792" 
+d="M640 454v-70q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-512 512q-19 19 -19 45t19 45l512 512q29 31 70 14q39 -17 39 -59v-69l-397 -398q-19 -19 -19 -45t19 -45zM1792 416q0 -58 -17 -133.5t-38.5 -138t-48 -125t-40.5 -90.5l-20 -40q-8 -17 -28 -17q-6 0 -9 1
+q-25 8 -23 34q43 400 -106 565q-64 71 -170.5 110.5t-267.5 52.5v-251q0 -42 -39 -59q-13 -5 -25 -5q-27 0 -45 19l-512 512q-19 19 -19 45t19 45l512 512q29 31 70 14q39 -17 39 -59v-262q411 -28 599 -221q169 -173 169 -509z" />
+    <glyph glyph-name="star_half_empty" unicode="&#xf123;" horiz-adv-x="1664" 
+d="M1186 579l257 250l-356 52l-66 10l-30 60l-159 322v-963l59 -31l318 -168l-60 355l-12 66zM1638 841l-363 -354l86 -500q5 -33 -6 -51.5t-34 -18.5q-17 0 -40 12l-449 236l-449 -236q-23 -12 -40 -12q-23 0 -34 18.5t-6 51.5l86 500l-364 354q-32 32 -23 59.5t54 34.5
+l502 73l225 455q20 41 49 41q28 0 49 -41l225 -455l502 -73q45 -7 54 -34.5t-24 -59.5z" />
+    <glyph glyph-name="location_arrow" unicode="&#xf124;" horiz-adv-x="1408" 
+d="M1401 1187l-640 -1280q-17 -35 -57 -35q-5 0 -15 2q-22 5 -35.5 22.5t-13.5 39.5v576h-576q-22 0 -39.5 13.5t-22.5 35.5t4 42t29 30l1280 640q13 7 29 7q27 0 45 -19q15 -14 18.5 -34.5t-6.5 -39.5z" />
+    <glyph glyph-name="crop" unicode="&#xf125;" horiz-adv-x="1664" 
+d="M557 256h595v595zM512 301l595 595h-595v-595zM1664 224v-192q0 -14 -9 -23t-23 -9h-224v-224q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v224h-864q-14 0 -23 9t-9 23v864h-224q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h224v224q0 14 9 23t23 9h192q14 0 23 -9t9 -23
+v-224h851l246 247q10 9 23 9t23 -9q9 -10 9 -23t-9 -23l-247 -246v-851h224q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="code_fork" unicode="&#xf126;" horiz-adv-x="1024" 
+d="M288 64q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM288 1216q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM928 1088q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM1024 1088q0 -52 -26 -96.5t-70 -69.5
+q-2 -287 -226 -414q-67 -38 -203 -81q-128 -40 -169.5 -71t-41.5 -100v-26q44 -25 70 -69.5t26 -96.5q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 52 26 96.5t70 69.5v820q-44 25 -70 69.5t-26 96.5q0 80 56 136t136 56t136 -56t56 -136q0 -52 -26 -96.5t-70 -69.5v-497
+q54 26 154 57q55 17 87.5 29.5t70.5 31t59 39.5t40.5 51t28 69.5t8.5 91.5q-44 25 -70 69.5t-26 96.5q0 80 56 136t136 56t136 -56t56 -136z" />
+    <glyph glyph-name="unlink" unicode="&#xf127;" horiz-adv-x="1664" 
+d="M439 265l-256 -256q-11 -9 -23 -9t-23 9q-9 10 -9 23t9 23l256 256q10 9 23 9t23 -9q9 -10 9 -23t-9 -23zM608 224v-320q0 -14 -9 -23t-23 -9t-23 9t-9 23v320q0 14 9 23t23 9t23 -9t9 -23zM384 448q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9t-9 23t9 23t23 9h320
+q14 0 23 -9t9 -23zM1648 320q0 -120 -85 -203l-147 -146q-83 -83 -203 -83q-121 0 -204 85l-334 335q-21 21 -42 56l239 18l273 -274q27 -27 68 -27.5t68 26.5l147 146q28 28 28 67q0 40 -28 68l-274 275l18 239q35 -21 56 -42l336 -336q84 -86 84 -204zM1031 1044l-239 -18
+l-273 274q-28 28 -68 28q-39 0 -68 -27l-147 -146q-28 -28 -28 -67q0 -40 28 -68l274 -274l-18 -240q-35 21 -56 42l-336 336q-84 86 -84 204q0 120 85 203l147 146q83 83 203 83q121 0 204 -85l334 -335q21 -21 42 -56zM1664 960q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9
+t-9 23t9 23t23 9h320q14 0 23 -9t9 -23zM1120 1504v-320q0 -14 -9 -23t-23 -9t-23 9t-9 23v320q0 14 9 23t23 9t23 -9t9 -23zM1527 1353l-256 -256q-11 -9 -23 -9t-23 9q-9 10 -9 23t9 23l256 256q10 9 23 9t23 -9q9 -10 9 -23t-9 -23z" />
+    <glyph glyph-name="question" unicode="&#xf128;" horiz-adv-x="1024" 
+d="M704 280v-240q0 -16 -12 -28t-28 -12h-240q-16 0 -28 12t-12 28v240q0 16 12 28t28 12h240q16 0 28 -12t12 -28zM1020 880q0 -54 -15.5 -101t-35 -76.5t-55 -59.5t-57.5 -43.5t-61 -35.5q-41 -23 -68.5 -65t-27.5 -67q0 -17 -12 -32.5t-28 -15.5h-240q-15 0 -25.5 18.5
+t-10.5 37.5v45q0 83 65 156.5t143 108.5q59 27 84 56t25 76q0 42 -46.5 74t-107.5 32q-65 0 -108 -29q-35 -25 -107 -115q-13 -16 -31 -16q-12 0 -25 8l-164 125q-13 10 -15.5 25t5.5 28q160 266 464 266q80 0 161 -31t146 -83t106 -127.5t41 -158.5z" />
+    <glyph glyph-name="_279" unicode="&#xf129;" horiz-adv-x="640" 
+d="M640 192v-128q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h64v384h-64q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h384q26 0 45 -19t19 -45v-576h64q26 0 45 -19t19 -45zM512 1344v-192q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v192
+q0 26 19 45t45 19h256q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="exclamation" unicode="&#xf12a;" horiz-adv-x="640" 
+d="M512 288v-224q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v224q0 26 19 45t45 19h256q26 0 45 -19t19 -45zM542 1344l-28 -768q-1 -26 -20.5 -45t-45.5 -19h-256q-26 0 -45.5 19t-20.5 45l-28 768q-1 26 17.5 45t44.5 19h320q26 0 44.5 -19t17.5 -45z" />
+    <glyph glyph-name="superscript" unicode="&#xf12b;" 
+d="M897 167v-167h-248l-159 252l-24 42q-8 9 -11 21h-3q-1 -3 -2.5 -6.5t-3.5 -8t-3 -6.5q-10 -20 -25 -44l-155 -250h-258v167h128l197 291l-185 272h-137v168h276l139 -228q2 -4 23 -42q8 -9 11 -21h3q3 9 11 21l25 42l140 228h257v-168h-125l-184 -267l204 -296h109z
+M1534 846v-206h-514l-3 27q-4 28 -4 46q0 64 26 117t65 86.5t84 65t84 54.5t65 54t26 64q0 38 -29.5 62.5t-70.5 24.5q-51 0 -97 -39q-14 -11 -36 -38l-105 92q26 37 63 66q83 65 188 65q110 0 178 -59.5t68 -158.5q0 -56 -24.5 -103t-62 -76.5t-81.5 -58.5t-82 -50.5
+t-65.5 -51.5t-30.5 -63h232v80h126z" />
+    <glyph glyph-name="subscript" unicode="&#xf12c;" 
+d="M897 167v-167h-248l-159 252l-24 42q-8 9 -11 21h-3q-1 -3 -2.5 -6.5t-3.5 -8t-3 -6.5q-10 -20 -25 -44l-155 -250h-258v167h128l197 291l-185 272h-137v168h276l139 -228q2 -4 23 -42q8 -9 11 -21h3q3 9 11 21l25 42l140 228h257v-168h-125l-184 -267l204 -296h109z
+M1536 -50v-206h-514l-4 27q-3 45 -3 46q0 64 26 117t65 86.5t84 65t84 54.5t65 54t26 64q0 38 -29.5 62.5t-70.5 24.5q-51 0 -97 -39q-14 -11 -36 -38l-105 92q26 37 63 66q80 65 188 65q110 0 178 -59.5t68 -158.5q0 -66 -34.5 -118.5t-84 -86t-99.5 -62.5t-87 -63t-41 -73
+h232v80h126z" />
+    <glyph glyph-name="_283" unicode="&#xf12d;" horiz-adv-x="1920" 
+d="M896 128l336 384h-768l-336 -384h768zM1909 1205q15 -34 9.5 -71.5t-30.5 -65.5l-896 -1024q-38 -44 -96 -44h-768q-38 0 -69.5 20.5t-47.5 54.5q-15 34 -9.5 71.5t30.5 65.5l896 1024q38 44 96 44h768q38 0 69.5 -20.5t47.5 -54.5z" />
+    <glyph glyph-name="puzzle_piece" unicode="&#xf12e;" horiz-adv-x="1664" 
+d="M1664 438q0 -81 -44.5 -135t-123.5 -54q-41 0 -77.5 17.5t-59 38t-56.5 38t-71 17.5q-110 0 -110 -124q0 -39 16 -115t15 -115v-5q-22 0 -33 -1q-34 -3 -97.5 -11.5t-115.5 -13.5t-98 -5q-61 0 -103 26.5t-42 83.5q0 37 17.5 71t38 56.5t38 59t17.5 77.5q0 79 -54 123.5
+t-135 44.5q-84 0 -143 -45.5t-59 -127.5q0 -43 15 -83t33.5 -64.5t33.5 -53t15 -50.5q0 -45 -46 -89q-37 -35 -117 -35q-95 0 -245 24q-9 2 -27.5 4t-27.5 4l-13 2q-1 0 -3 1q-2 0 -2 1v1024q2 -1 17.5 -3.5t34 -5t21.5 -3.5q150 -24 245 -24q80 0 117 35q46 44 46 89
+q0 22 -15 50.5t-33.5 53t-33.5 64.5t-15 83q0 82 59 127.5t144 45.5q80 0 134 -44.5t54 -123.5q0 -41 -17.5 -77.5t-38 -59t-38 -56.5t-17.5 -71q0 -57 42 -83.5t103 -26.5q64 0 180 15t163 17v-2q-1 -2 -3.5 -17.5t-5 -34t-3.5 -21.5q-24 -150 -24 -245q0 -80 35 -117
+q44 -46 89 -46q22 0 50.5 15t53 33.5t64.5 33.5t83 15q82 0 127.5 -59t45.5 -143z" />
+    <glyph glyph-name="microphone" unicode="&#xf130;" horiz-adv-x="1152" 
+d="M1152 832v-128q0 -221 -147.5 -384.5t-364.5 -187.5v-132h256q26 0 45 -19t19 -45t-19 -45t-45 -19h-640q-26 0 -45 19t-19 45t19 45t45 19h256v132q-217 24 -364.5 187.5t-147.5 384.5v128q0 26 19 45t45 19t45 -19t19 -45v-128q0 -185 131.5 -316.5t316.5 -131.5
+t316.5 131.5t131.5 316.5v128q0 26 19 45t45 19t45 -19t19 -45zM896 1216v-512q0 -132 -94 -226t-226 -94t-226 94t-94 226v512q0 132 94 226t226 94t226 -94t94 -226z" />
+    <glyph glyph-name="microphone_off" unicode="&#xf131;" horiz-adv-x="1408" 
+d="M271 591l-101 -101q-42 103 -42 214v128q0 26 19 45t45 19t45 -19t19 -45v-128q0 -53 15 -113zM1385 1193l-361 -361v-128q0 -132 -94 -226t-226 -94q-55 0 -109 19l-96 -96q97 -51 205 -51q185 0 316.5 131.5t131.5 316.5v128q0 26 19 45t45 19t45 -19t19 -45v-128
+q0 -221 -147.5 -384.5t-364.5 -187.5v-132h256q26 0 45 -19t19 -45t-19 -45t-45 -19h-640q-26 0 -45 19t-19 45t19 45t45 19h256v132q-125 13 -235 81l-254 -254q-10 -10 -23 -10t-23 10l-82 82q-10 10 -10 23t10 23l1234 1234q10 10 23 10t23 -10l82 -82q10 -10 10 -23
+t-10 -23zM1005 1325l-621 -621v512q0 132 94 226t226 94q102 0 184.5 -59t116.5 -152z" />
+    <glyph glyph-name="shield" unicode="&#xf132;" horiz-adv-x="1280" 
+d="M1088 576v640h-448v-1137q119 63 213 137q235 184 235 360zM1280 1344v-768q0 -86 -33.5 -170.5t-83 -150t-118 -127.5t-126.5 -103t-121 -77.5t-89.5 -49.5t-42.5 -20q-12 -6 -26 -6t-26 6q-16 7 -42.5 20t-89.5 49.5t-121 77.5t-126.5 103t-118 127.5t-83 150
+t-33.5 170.5v768q0 26 19 45t45 19h1152q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="calendar_empty" unicode="&#xf133;" horiz-adv-x="1664" 
+d="M128 -128h1408v1024h-1408v-1024zM512 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1280 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1664 1152v-1280
+q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h128q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="fire_extinguisher" unicode="&#xf134;" horiz-adv-x="1408" 
+d="M512 1344q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1408 1376v-320q0 -16 -12 -25q-8 -7 -20 -7q-4 0 -7 1l-448 96q-11 2 -18 11t-7 20h-256v-102q111 -23 183.5 -111t72.5 -203v-800q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v800
+q0 106 62.5 190.5t161.5 114.5v111h-32q-59 0 -115 -23.5t-91.5 -53t-66 -66.5t-40.5 -53.5t-14 -24.5q-17 -35 -57 -35q-16 0 -29 7q-23 12 -31.5 37t3.5 49q5 10 14.5 26t37.5 53.5t60.5 70t85 67t108.5 52.5q-25 42 -25 86q0 66 47 113t113 47t113 -47t47 -113
+q0 -33 -14 -64h302q0 11 7 20t18 11l448 96q3 1 7 1q12 0 20 -7q12 -9 12 -25z" />
+    <glyph glyph-name="rocket" unicode="&#xf135;" horiz-adv-x="1664" 
+d="M1440 1088q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM1664 1376q0 -249 -75.5 -430.5t-253.5 -360.5q-81 -80 -195 -176l-20 -379q-2 -16 -16 -26l-384 -224q-7 -4 -16 -4q-12 0 -23 9l-64 64q-13 14 -8 32l85 276l-281 281l-276 -85q-3 -1 -9 -1
+q-14 0 -23 9l-64 64q-17 19 -5 39l224 384q10 14 26 16l379 20q96 114 176 195q188 187 358 258t431 71q14 0 24 -9.5t10 -22.5z" />
+    <glyph glyph-name="maxcdn" unicode="&#xf136;" horiz-adv-x="1792" 
+d="M1745 763l-164 -763h-334l178 832q13 56 -15 88q-27 33 -83 33h-169l-204 -953h-334l204 953h-286l-204 -953h-334l204 953l-153 327h1276q101 0 189.5 -40.5t147.5 -113.5q60 -73 81 -168.5t0 -194.5z" />
+    <glyph glyph-name="chevron_sign_left" unicode="&#xf137;" 
+d="M909 141l102 102q19 19 19 45t-19 45l-307 307l307 307q19 19 19 45t-19 45l-102 102q-19 19 -45 19t-45 -19l-454 -454q-19 -19 -19 -45t19 -45l454 -454q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5
+t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="chevron_sign_right" unicode="&#xf138;" 
+d="M717 141l454 454q19 19 19 45t-19 45l-454 454q-19 19 -45 19t-45 -19l-102 -102q-19 -19 -19 -45t19 -45l307 -307l-307 -307q-19 -19 -19 -45t19 -45l102 -102q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5
+t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="chevron_sign_up" unicode="&#xf139;" 
+d="M1165 397l102 102q19 19 19 45t-19 45l-454 454q-19 19 -45 19t-45 -19l-454 -454q-19 -19 -19 -45t19 -45l102 -102q19 -19 45 -19t45 19l307 307l307 -307q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5
+t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="chevron_sign_down" unicode="&#xf13a;" 
+d="M813 237l454 454q19 19 19 45t-19 45l-102 102q-19 19 -45 19t-45 -19l-307 -307l-307 307q-19 19 -45 19t-45 -19l-102 -102q-19 -19 -19 -45t19 -45l454 -454q19 -19 45 -19t45 19zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5
+t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="html5" unicode="&#xf13b;" horiz-adv-x="1408" 
+d="M1130 939l16 175h-884l47 -534h612l-22 -228l-197 -53l-196 53l-13 140h-175l22 -278l362 -100h4v1l359 99l50 544h-644l-15 181h674zM0 1408h1408l-128 -1438l-578 -162l-574 162z" />
+    <glyph glyph-name="css3" unicode="&#xf13c;" horiz-adv-x="1792" 
+d="M275 1408h1505l-266 -1333l-804 -267l-698 267l71 356h297l-29 -147l422 -161l486 161l68 339h-1208l58 297h1209l38 191h-1208z" />
+    <glyph glyph-name="anchor" unicode="&#xf13d;" horiz-adv-x="1792" 
+d="M960 1280q0 26 -19 45t-45 19t-45 -19t-19 -45t19 -45t45 -19t45 19t19 45zM1792 352v-352q0 -22 -20 -30q-8 -2 -12 -2q-12 0 -23 9l-93 93q-119 -143 -318.5 -226.5t-429.5 -83.5t-429.5 83.5t-318.5 226.5l-93 -93q-9 -9 -23 -9q-4 0 -12 2q-20 8 -20 30v352
+q0 14 9 23t23 9h352q22 0 30 -20q8 -19 -7 -35l-100 -100q67 -91 189.5 -153.5t271.5 -82.5v647h-192q-26 0 -45 19t-19 45v128q0 26 19 45t45 19h192v163q-58 34 -93 92.5t-35 128.5q0 106 75 181t181 75t181 -75t75 -181q0 -70 -35 -128.5t-93 -92.5v-163h192q26 0 45 -19
+t19 -45v-128q0 -26 -19 -45t-45 -19h-192v-647q149 20 271.5 82.5t189.5 153.5l-100 100q-15 16 -7 35q8 20 30 20h352q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="unlock_alt" unicode="&#xf13e;" horiz-adv-x="1152" 
+d="M1056 768q40 0 68 -28t28 -68v-576q0 -40 -28 -68t-68 -28h-960q-40 0 -68 28t-28 68v576q0 40 28 68t68 28h32v320q0 185 131.5 316.5t316.5 131.5t316.5 -131.5t131.5 -316.5q0 -26 -19 -45t-45 -19h-64q-26 0 -45 19t-19 45q0 106 -75 181t-181 75t-181 -75t-75 -181
+v-320h736z" />
+    <glyph glyph-name="bullseye" unicode="&#xf140;" 
+d="M1024 640q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181zM1152 640q0 159 -112.5 271.5t-271.5 112.5t-271.5 -112.5t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5zM1280 640q0 -212 -150 -362t-362 -150t-362 150
+t-150 362t150 362t362 150t362 -150t150 -362zM1408 640q0 130 -51 248.5t-136.5 204t-204 136.5t-248.5 51t-248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5zM1536 640
+q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="ellipsis_horizontal" unicode="&#xf141;" horiz-adv-x="1408" 
+d="M384 800v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM896 800v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM1408 800v-192q0 -40 -28 -68t-68 -28h-192
+q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="ellipsis_vertical" unicode="&#xf142;" horiz-adv-x="384" 
+d="M384 288v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM384 800v-192q0 -40 -28 -68t-68 -28h-192q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68zM384 1312v-192q0 -40 -28 -68t-68 -28h-192
+q-40 0 -68 28t-28 68v192q0 40 28 68t68 28h192q40 0 68 -28t28 -68z" />
+    <glyph glyph-name="_303" unicode="&#xf143;" 
+d="M512 256q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM863 162q-13 233 -176.5 396.5t-396.5 176.5q-14 1 -24 -9t-10 -23v-128q0 -13 8.5 -22t21.5 -10q154 -11 264 -121t121 -264q1 -13 10 -21.5t22 -8.5h128
+q13 0 23 10t9 24zM1247 161q-5 154 -56 297.5t-139.5 260t-205 205t-260 139.5t-297.5 56q-14 1 -23 -9q-10 -10 -10 -23v-128q0 -13 9 -22t22 -10q204 -7 378 -111.5t278.5 -278.5t111.5 -378q1 -13 10 -22t22 -9h128q13 0 23 10q11 9 9 23zM1536 1120v-960
+q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="play_sign" unicode="&#xf144;" 
+d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM1152 585q32 18 32 55t-32 55l-544 320q-31 19 -64 1q-32 -19 -32 -56v-640q0 -37 32 -56
+q16 -8 32 -8q17 0 32 9z" />
+    <glyph glyph-name="ticket" unicode="&#xf145;" horiz-adv-x="1792" 
+d="M1024 1084l316 -316l-572 -572l-316 316zM813 105l618 618q19 19 19 45t-19 45l-362 362q-18 18 -45 18t-45 -18l-618 -618q-19 -19 -19 -45t19 -45l362 -362q18 -18 45 -18t45 18zM1702 742l-907 -908q-37 -37 -90.5 -37t-90.5 37l-126 126q56 56 56 136t-56 136
+t-136 56t-136 -56l-125 126q-37 37 -37 90.5t37 90.5l907 906q37 37 90.5 37t90.5 -37l125 -125q-56 -56 -56 -136t56 -136t136 -56t136 56l126 -125q37 -37 37 -90.5t-37 -90.5z" />
+    <glyph glyph-name="minus_sign_alt" unicode="&#xf146;" 
+d="M1280 576v128q0 26 -19 45t-45 19h-896q-26 0 -45 -19t-19 -45v-128q0 -26 19 -45t45 -19h896q26 0 45 19t19 45zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5
+t84.5 -203.5z" />
+    <glyph glyph-name="check_minus" unicode="&#xf147;" horiz-adv-x="1408" 
+d="M1152 736v-64q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h832q14 0 23 -9t9 -23zM1280 288v832q0 66 -47 113t-113 47h-832q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113zM1408 1120v-832q0 -119 -84.5 -203.5
+t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="level_up" unicode="&#xf148;" horiz-adv-x="1024" 
+d="M1018 933q-18 -37 -58 -37h-192v-864q0 -14 -9 -23t-23 -9h-704q-21 0 -29 18q-8 20 4 35l160 192q9 11 25 11h320v640h-192q-40 0 -58 37q-17 37 9 68l320 384q18 22 49 22t49 -22l320 -384q27 -32 9 -68z" />
+    <glyph glyph-name="level_down" unicode="&#xf149;" horiz-adv-x="1024" 
+d="M32 1280h704q13 0 22.5 -9.5t9.5 -23.5v-863h192q40 0 58 -37t-9 -69l-320 -384q-18 -22 -49 -22t-49 22l-320 384q-26 31 -9 69q18 37 58 37h192v640h-320q-14 0 -25 11l-160 192q-13 14 -4 34q9 19 29 19z" />
+    <glyph glyph-name="check_sign" unicode="&#xf14a;" 
+d="M685 237l614 614q19 19 19 45t-19 45l-102 102q-19 19 -45 19t-45 -19l-467 -467l-211 211q-19 19 -45 19t-45 -19l-102 -102q-19 -19 -19 -45t19 -45l358 -358q19 -19 45 -19t45 19zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5
+t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="edit_sign" unicode="&#xf14b;" 
+d="M404 428l152 -152l-52 -52h-56v96h-96v56zM818 818q14 -13 -3 -30l-291 -291q-17 -17 -30 -3q-14 13 3 30l291 291q17 17 30 3zM544 128l544 544l-288 288l-544 -544v-288h288zM1152 736l92 92q28 28 28 68t-28 68l-152 152q-28 28 -68 28t-68 -28l-92 -92zM1536 1120
+v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_312" unicode="&#xf14c;" 
+d="M1280 608v480q0 26 -19 45t-45 19h-480q-42 0 -59 -39q-17 -41 14 -70l144 -144l-534 -534q-19 -19 -19 -45t19 -45l102 -102q19 -19 45 -19t45 19l534 534l144 -144q18 -19 45 -19q12 0 25 5q39 17 39 59zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960
+q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="share_sign" unicode="&#xf14d;" 
+d="M1005 435l352 352q19 19 19 45t-19 45l-352 352q-30 31 -69 14q-40 -17 -40 -59v-160q-119 0 -216 -19.5t-162.5 -51t-114 -79t-76.5 -95.5t-44.5 -109t-21.5 -111.5t-5 -110.5q0 -181 167 -404q11 -12 25 -12q7 0 13 3q22 9 19 33q-44 354 62 473q46 52 130 75.5
+t224 23.5v-160q0 -42 40 -59q12 -5 24 -5q26 0 45 19zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="compass" unicode="&#xf14e;" 
+d="M640 448l256 128l-256 128v-256zM1024 1039v-542l-512 -256v542zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103
+t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="collapse" unicode="&#xf150;" 
+d="M1145 861q18 -35 -5 -66l-320 -448q-19 -27 -52 -27t-52 27l-320 448q-23 31 -5 66q17 35 57 35h640q40 0 57 -35zM1280 160v960q0 13 -9.5 22.5t-22.5 9.5h-960q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5zM1536 1120
+v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="collapse_top" unicode="&#xf151;" 
+d="M1145 419q-17 -35 -57 -35h-640q-40 0 -57 35q-18 35 5 66l320 448q19 27 52 27t52 -27l320 -448q23 -31 5 -66zM1280 160v960q0 13 -9.5 22.5t-22.5 9.5h-960q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5zM1536 1120v-960
+q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_317" unicode="&#xf152;" 
+d="M1088 640q0 -33 -27 -52l-448 -320q-31 -23 -66 -5q-35 17 -35 57v640q0 40 35 57q35 18 66 -5l448 -320q27 -19 27 -52zM1280 160v960q0 14 -9 23t-23 9h-960q-14 0 -23 -9t-9 -23v-960q0 -14 9 -23t23 -9h960q14 0 23 9t9 23zM1536 1120v-960q0 -119 -84.5 -203.5
+t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="eur" unicode="&#xf153;" horiz-adv-x="1024" 
+d="M976 229l35 -159q3 -12 -3 -22.5t-17 -14.5l-5 -1q-4 -2 -10.5 -3.5t-16 -4.5t-21.5 -5.5t-25.5 -5t-30 -5t-33.5 -4.5t-36.5 -3t-38.5 -1q-234 0 -409 130.5t-238 351.5h-95q-13 0 -22.5 9.5t-9.5 22.5v113q0 13 9.5 22.5t22.5 9.5h66q-2 57 1 105h-67q-14 0 -23 9
+t-9 23v114q0 14 9 23t23 9h98q67 210 243.5 338t400.5 128q102 0 194 -23q11 -3 20 -15q6 -11 3 -24l-43 -159q-3 -13 -14 -19.5t-24 -2.5l-4 1q-4 1 -11.5 2.5l-17.5 3.5t-22.5 3.5t-26 3t-29 2.5t-29.5 1q-126 0 -226 -64t-150 -176h468q16 0 25 -12q10 -12 7 -26
+l-24 -114q-5 -26 -32 -26h-488q-3 -37 0 -105h459q15 0 25 -12q9 -12 6 -27l-24 -112q-2 -11 -11 -18.5t-20 -7.5h-387q48 -117 149.5 -185.5t228.5 -68.5q18 0 36 1.5t33.5 3.5t29.5 4.5t24.5 5t18.5 4.5l12 3l5 2q13 5 26 -2q12 -7 15 -21z" />
+    <glyph glyph-name="gbp" unicode="&#xf154;" horiz-adv-x="1024" 
+d="M1020 399v-367q0 -14 -9 -23t-23 -9h-956q-14 0 -23 9t-9 23v150q0 13 9.5 22.5t22.5 9.5h97v383h-95q-14 0 -23 9.5t-9 22.5v131q0 14 9 23t23 9h95v223q0 171 123.5 282t314.5 111q185 0 335 -125q9 -8 10 -20.5t-7 -22.5l-103 -127q-9 -11 -22 -12q-13 -2 -23 7
+q-5 5 -26 19t-69 32t-93 18q-85 0 -137 -47t-52 -123v-215h305q13 0 22.5 -9t9.5 -23v-131q0 -13 -9.5 -22.5t-22.5 -9.5h-305v-379h414v181q0 13 9 22.5t23 9.5h162q14 0 23 -9.5t9 -22.5z" />
+    <glyph glyph-name="usd" unicode="&#xf155;" horiz-adv-x="1024" 
+d="M978 351q0 -153 -99.5 -263.5t-258.5 -136.5v-175q0 -14 -9 -23t-23 -9h-135q-13 0 -22.5 9.5t-9.5 22.5v175q-66 9 -127.5 31t-101.5 44.5t-74 48t-46.5 37.5t-17.5 18q-17 21 -2 41l103 135q7 10 23 12q15 2 24 -9l2 -2q113 -99 243 -125q37 -8 74 -8q81 0 142.5 43
+t61.5 122q0 28 -15 53t-33.5 42t-58.5 37.5t-66 32t-80 32.5q-39 16 -61.5 25t-61.5 26.5t-62.5 31t-56.5 35.5t-53.5 42.5t-43.5 49t-35.5 58t-21 66.5t-8.5 78q0 138 98 242t255 134v180q0 13 9.5 22.5t22.5 9.5h135q14 0 23 -9t9 -23v-176q57 -6 110.5 -23t87 -33.5
+t63.5 -37.5t39 -29t15 -14q17 -18 5 -38l-81 -146q-8 -15 -23 -16q-14 -3 -27 7q-3 3 -14.5 12t-39 26.5t-58.5 32t-74.5 26t-85.5 11.5q-95 0 -155 -43t-60 -111q0 -26 8.5 -48t29.5 -41.5t39.5 -33t56 -31t60.5 -27t70 -27.5q53 -20 81 -31.5t76 -35t75.5 -42.5t62 -50
+t53 -63.5t31.5 -76.5t13 -94z" />
+    <glyph glyph-name="inr" unicode="&#xf156;" horiz-adv-x="898" 
+d="M898 1066v-102q0 -14 -9 -23t-23 -9h-168q-23 -144 -129 -234t-276 -110q167 -178 459 -536q14 -16 4 -34q-8 -18 -29 -18h-195q-16 0 -25 12q-306 367 -498 571q-9 9 -9 22v127q0 13 9.5 22.5t22.5 9.5h112q132 0 212.5 43t102.5 125h-427q-14 0 -23 9t-9 23v102
+q0 14 9 23t23 9h413q-57 113 -268 113h-145q-13 0 -22.5 9.5t-9.5 22.5v133q0 14 9 23t23 9h832q14 0 23 -9t9 -23v-102q0 -14 -9 -23t-23 -9h-233q47 -61 64 -144h171q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="jpy" unicode="&#xf157;" horiz-adv-x="1027" 
+d="M603 0h-172q-13 0 -22.5 9t-9.5 23v330h-288q-13 0 -22.5 9t-9.5 23v103q0 13 9.5 22.5t22.5 9.5h288v85h-288q-13 0 -22.5 9t-9.5 23v104q0 13 9.5 22.5t22.5 9.5h214l-321 578q-8 16 0 32q10 16 28 16h194q19 0 29 -18l215 -425q19 -38 56 -125q10 24 30.5 68t27.5 61
+l191 420q8 19 29 19h191q17 0 27 -16q9 -14 1 -31l-313 -579h215q13 0 22.5 -9.5t9.5 -22.5v-104q0 -14 -9.5 -23t-22.5 -9h-290v-85h290q13 0 22.5 -9.5t9.5 -22.5v-103q0 -14 -9.5 -23t-22.5 -9h-290v-330q0 -13 -9.5 -22.5t-22.5 -9.5z" />
+    <glyph glyph-name="rub" unicode="&#xf158;" horiz-adv-x="1280" 
+d="M1043 971q0 100 -65 162t-171 62h-320v-448h320q106 0 171 62t65 162zM1280 971q0 -193 -126.5 -315t-326.5 -122h-340v-118h505q14 0 23 -9t9 -23v-128q0 -14 -9 -23t-23 -9h-505v-192q0 -14 -9.5 -23t-22.5 -9h-167q-14 0 -23 9t-9 23v192h-224q-14 0 -23 9t-9 23v128
+q0 14 9 23t23 9h224v118h-224q-14 0 -23 9t-9 23v149q0 13 9 22.5t23 9.5h224v629q0 14 9 23t23 9h539q200 0 326.5 -122t126.5 -315z" />
+    <glyph glyph-name="krw" unicode="&#xf159;" horiz-adv-x="1792" 
+d="M514 341l81 299h-159l75 -300q1 -1 1 -3t1 -3q0 1 0.5 3.5t0.5 3.5zM630 768l35 128h-292l32 -128h225zM822 768h139l-35 128h-70zM1271 340l78 300h-162l81 -299q0 -1 0.5 -3.5t1.5 -3.5q0 1 0.5 3t0.5 3zM1382 768l33 128h-297l34 -128h230zM1792 736v-64q0 -14 -9 -23
+t-23 -9h-213l-164 -616q-7 -24 -31 -24h-159q-24 0 -31 24l-166 616h-209l-167 -616q-7 -24 -31 -24h-159q-11 0 -19.5 7t-10.5 17l-160 616h-208q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h175l-33 128h-142q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h109l-89 344q-5 15 5 28
+q10 12 26 12h137q26 0 31 -24l90 -360h359l97 360q7 24 31 24h126q24 0 31 -24l98 -360h365l93 360q5 24 31 24h137q16 0 26 -12q10 -13 5 -28l-91 -344h111q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-145l-34 -128h179q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="btc" unicode="&#xf15a;" horiz-adv-x="1280" 
+d="M1167 896q18 -182 -131 -258q117 -28 175 -103t45 -214q-7 -71 -32.5 -125t-64.5 -89t-97 -58.5t-121.5 -34.5t-145.5 -15v-255h-154v251q-80 0 -122 1v-252h-154v255q-18 0 -54 0.5t-55 0.5h-200l31 183h111q50 0 58 51v402h16q-6 1 -16 1v287q-13 68 -89 68h-111v164
+l212 -1q64 0 97 1v252h154v-247q82 2 122 2v245h154v-252q79 -7 140 -22.5t113 -45t82.5 -78t36.5 -114.5zM952 351q0 36 -15 64t-37 46t-57.5 30.5t-65.5 18.5t-74 9t-69 3t-64.5 -1t-47.5 -1v-338q8 0 37 -0.5t48 -0.5t53 1.5t58.5 4t57 8.5t55.5 14t47.5 21t39.5 30
+t24.5 40t9.5 51zM881 827q0 33 -12.5 58.5t-30.5 42t-48 28t-55 16.5t-61.5 8t-58 2.5t-54 -1t-39.5 -0.5v-307q5 0 34.5 -0.5t46.5 0t50 2t55 5.5t51.5 11t48.5 18.5t37 27t27 38.5t9 51z" />
+    <glyph glyph-name="file" unicode="&#xf15b;" 
+d="M1024 1024v472q22 -14 36 -28l408 -408q14 -14 28 -36h-472zM896 992q0 -40 28 -68t68 -28h544v-1056q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h800v-544z" />
+    <glyph glyph-name="file_text" unicode="&#xf15c;" 
+d="M1468 1060q14 -14 28 -36h-472v472q22 -14 36 -28zM992 896h544v-1056q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h800v-544q0 -40 28 -68t68 -28zM1152 160v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704
+q14 0 23 9t9 23zM1152 416v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704q14 0 23 9t9 23zM1152 672v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704q14 0 23 9t9 23z" />
+    <glyph glyph-name="sort_by_alphabet" unicode="&#xf15d;" horiz-adv-x="1664" 
+d="M1191 1128h177l-72 218l-12 47q-2 16 -2 20h-4l-3 -20q0 -1 -3.5 -18t-7.5 -29zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23zM1572 -23
+v-233h-584v90l369 529q12 18 21 27l11 9v3q-2 0 -6.5 -0.5t-7.5 -0.5q-12 -3 -30 -3h-232v-115h-120v229h567v-89l-369 -530q-6 -8 -21 -26l-11 -11v-2l14 2q9 2 30 2h248v119h121zM1661 874v-106h-288v106h75l-47 144h-243l-47 -144h75v-106h-287v106h70l230 662h162
+l230 -662h70z" />
+    <glyph glyph-name="_329" unicode="&#xf15e;" horiz-adv-x="1664" 
+d="M1191 104h177l-72 218l-12 47q-2 16 -2 20h-4l-3 -20q0 -1 -3.5 -18t-7.5 -29zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23zM1661 -150
+v-106h-288v106h75l-47 144h-243l-47 -144h75v-106h-287v106h70l230 662h162l230 -662h70zM1572 1001v-233h-584v90l369 529q12 18 21 27l11 9v3q-2 0 -6.5 -0.5t-7.5 -0.5q-12 -3 -30 -3h-232v-115h-120v229h567v-89l-369 -530q-6 -8 -21 -26l-11 -10v-3l14 3q9 1 30 1h248
+v119h121z" />
+    <glyph glyph-name="sort_by_attributes" unicode="&#xf160;" horiz-adv-x="1792" 
+d="M736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23zM1792 -32v-192q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h832
+q14 0 23 -9t9 -23zM1600 480v-192q0 -14 -9 -23t-23 -9h-640q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h640q14 0 23 -9t9 -23zM1408 992v-192q0 -14 -9 -23t-23 -9h-448q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h448q14 0 23 -9t9 -23zM1216 1504v-192q0 -14 -9 -23t-23 -9h-256
+q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h256q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="sort_by_attributes_alt" unicode="&#xf161;" horiz-adv-x="1792" 
+d="M1216 -32v-192q0 -14 -9 -23t-23 -9h-256q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h256q14 0 23 -9t9 -23zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192
+q14 0 23 -9t9 -23zM1408 480v-192q0 -14 -9 -23t-23 -9h-448q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h448q14 0 23 -9t9 -23zM1600 992v-192q0 -14 -9 -23t-23 -9h-640q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h640q14 0 23 -9t9 -23zM1792 1504v-192q0 -14 -9 -23t-23 -9h-832
+q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h832q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="sort_by_order" unicode="&#xf162;" 
+d="M1346 223q0 63 -44 116t-103 53q-52 0 -83 -37t-31 -94t36.5 -95t104.5 -38q50 0 85 27t35 68zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9t9 -23
+zM1486 165q0 -62 -13 -121.5t-41 -114t-68 -95.5t-98.5 -65.5t-127.5 -24.5q-62 0 -108 16q-24 8 -42 15l39 113q15 -7 31 -11q37 -13 75 -13q84 0 134.5 58.5t66.5 145.5h-2q-21 -23 -61.5 -37t-84.5 -14q-106 0 -173 71.5t-67 172.5q0 105 72 178t181 73q123 0 205 -94.5
+t82 -252.5zM1456 882v-114h-469v114h167v432q0 7 0.5 19t0.5 17v16h-2l-7 -12q-8 -13 -26 -31l-62 -58l-82 86l192 185h123v-654h165z" />
+    <glyph glyph-name="sort_by_order_alt" unicode="&#xf163;" 
+d="M1346 1247q0 63 -44 116t-103 53q-52 0 -83 -37t-31 -94t36.5 -95t104.5 -38q50 0 85 27t35 68zM736 96q0 -12 -10 -24l-319 -319q-10 -9 -23 -9q-12 0 -23 9l-320 320q-15 16 -7 35q8 20 30 20h192v1376q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1376h192q14 0 23 -9
+t9 -23zM1456 -142v-114h-469v114h167v432q0 7 0.5 19t0.5 17v16h-2l-7 -12q-8 -13 -26 -31l-62 -58l-82 86l192 185h123v-654h165zM1486 1189q0 -62 -13 -121.5t-41 -114t-68 -95.5t-98.5 -65.5t-127.5 -24.5q-62 0 -108 16q-24 8 -42 15l39 113q15 -7 31 -11q37 -13 75 -13
+q84 0 134.5 58.5t66.5 145.5h-2q-21 -23 -61.5 -37t-84.5 -14q-106 0 -173 71.5t-67 172.5q0 105 72 178t181 73q123 0 205 -94.5t82 -252.5z" />
+    <glyph glyph-name="_334" unicode="&#xf164;" horiz-adv-x="1664" 
+d="M256 192q0 26 -19 45t-45 19q-27 0 -45.5 -19t-18.5 -45q0 -27 18.5 -45.5t45.5 -18.5q26 0 45 18.5t19 45.5zM416 704v-640q0 -26 -19 -45t-45 -19h-288q-26 0 -45 19t-19 45v640q0 26 19 45t45 19h288q26 0 45 -19t19 -45zM1600 704q0 -86 -55 -149q15 -44 15 -76
+q3 -76 -43 -137q17 -56 0 -117q-15 -57 -54 -94q9 -112 -49 -181q-64 -76 -197 -78h-36h-76h-17q-66 0 -144 15.5t-121.5 29t-120.5 39.5q-123 43 -158 44q-26 1 -45 19.5t-19 44.5v641q0 25 18 43.5t43 20.5q24 2 76 59t101 121q68 87 101 120q18 18 31 48t17.5 48.5
+t13.5 60.5q7 39 12.5 61t19.5 52t34 50q19 19 45 19q46 0 82.5 -10.5t60 -26t40 -40.5t24 -45t12 -50t5 -45t0.5 -39q0 -38 -9.5 -76t-19 -60t-27.5 -56q-3 -6 -10 -18t-11 -22t-8 -24h277q78 0 135 -57t57 -135z" />
+    <glyph glyph-name="_335" unicode="&#xf165;" horiz-adv-x="1664" 
+d="M256 960q0 -26 -19 -45t-45 -19q-27 0 -45.5 19t-18.5 45q0 27 18.5 45.5t45.5 18.5q26 0 45 -18.5t19 -45.5zM416 448v640q0 26 -19 45t-45 19h-288q-26 0 -45 -19t-19 -45v-640q0 -26 19 -45t45 -19h288q26 0 45 19t19 45zM1545 597q55 -61 55 -149q-1 -78 -57.5 -135
+t-134.5 -57h-277q4 -14 8 -24t11 -22t10 -18q18 -37 27 -57t19 -58.5t10 -76.5q0 -24 -0.5 -39t-5 -45t-12 -50t-24 -45t-40 -40.5t-60 -26t-82.5 -10.5q-26 0 -45 19q-20 20 -34 50t-19.5 52t-12.5 61q-9 42 -13.5 60.5t-17.5 48.5t-31 48q-33 33 -101 120q-49 64 -101 121
+t-76 59q-25 2 -43 20.5t-18 43.5v641q0 26 19 44.5t45 19.5q35 1 158 44q77 26 120.5 39.5t121.5 29t144 15.5h17h76h36q133 -2 197 -78q58 -69 49 -181q39 -37 54 -94q17 -61 0 -117q46 -61 43 -137q0 -32 -15 -76z" />
+    <glyph glyph-name="youtube_sign" unicode="&#xf166;" 
+d="M919 233v157q0 50 -29 50q-17 0 -33 -16v-224q16 -16 33 -16q29 0 29 49zM1103 355h66v34q0 51 -33 51t-33 -51v-34zM532 621v-70h-80v-423h-74v423h-78v70h232zM733 495v-367h-67v40q-39 -45 -76 -45q-33 0 -42 28q-6 17 -6 54v290h66v-270q0 -24 1 -26q1 -15 15 -15
+q20 0 42 31v280h67zM985 384v-146q0 -52 -7 -73q-12 -42 -53 -42q-35 0 -68 41v-36h-67v493h67v-161q32 40 68 40q41 0 53 -42q7 -21 7 -74zM1236 255v-9q0 -29 -2 -43q-3 -22 -15 -40q-27 -40 -80 -40q-52 0 -81 38q-21 27 -21 86v129q0 59 20 86q29 38 80 38t78 -38
+q21 -29 21 -86v-76h-133v-65q0 -51 34 -51q24 0 30 26q0 1 0.5 7t0.5 16.5v21.5h68zM785 1079v-156q0 -51 -32 -51t-32 51v156q0 52 32 52t32 -52zM1318 366q0 177 -19 260q-10 44 -43 73.5t-76 34.5q-136 15 -412 15q-275 0 -411 -15q-44 -5 -76.5 -34.5t-42.5 -73.5
+q-20 -87 -20 -260q0 -176 20 -260q10 -43 42.5 -73t75.5 -35q137 -15 412 -15t412 15q43 5 75.5 35t42.5 73q20 84 20 260zM563 1017l90 296h-75l-51 -195l-53 195h-78q7 -23 23 -69l24 -69q35 -103 46 -158v-201h74v201zM852 936v130q0 58 -21 87q-29 38 -78 38
+q-51 0 -78 -38q-21 -29 -21 -87v-130q0 -58 21 -87q27 -38 78 -38q49 0 78 38q21 27 21 87zM1033 816h67v370h-67v-283q-22 -31 -42 -31q-15 0 -16 16q-1 2 -1 26v272h-67v-293q0 -37 6 -55q11 -27 43 -27q36 0 77 45v-40zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5
+h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="youtube" unicode="&#xf167;" 
+d="M971 292v-211q0 -67 -39 -67q-23 0 -45 22v301q22 22 45 22q39 0 39 -67zM1309 291v-46h-90v46q0 68 45 68t45 -68zM343 509h107v94h-312v-94h105v-569h100v569zM631 -60h89v494h-89v-378q-30 -42 -57 -42q-18 0 -21 21q-1 3 -1 35v364h-89v-391q0 -49 8 -73
+q12 -37 58 -37q48 0 102 61v-54zM1060 88v197q0 73 -9 99q-17 56 -71 56q-50 0 -93 -54v217h-89v-663h89v48q45 -55 93 -55q54 0 71 55q9 27 9 100zM1398 98v13h-91q0 -51 -2 -61q-7 -36 -40 -36q-46 0 -46 69v87h179v103q0 79 -27 116q-39 51 -106 51q-68 0 -107 -51
+q-28 -37 -28 -116v-173q0 -79 29 -116q39 -51 108 -51q72 0 108 53q18 27 21 54q2 9 2 58zM790 1011v210q0 69 -43 69t-43 -69v-210q0 -70 43 -70t43 70zM1509 260q0 -234 -26 -350q-14 -59 -58 -99t-102 -46q-184 -21 -555 -21t-555 21q-58 6 -102.5 46t-57.5 99
+q-26 112 -26 350q0 234 26 350q14 59 58 99t103 47q183 20 554 20t555 -20q58 -7 102.5 -47t57.5 -99q26 -112 26 -350zM511 1536h102l-121 -399v-271h-100v271q-14 74 -61 212q-37 103 -65 187h106l71 -263zM881 1203v-175q0 -81 -28 -118q-38 -51 -106 -51q-67 0 -105 51
+q-28 38 -28 118v175q0 80 28 117q38 51 105 51q68 0 106 -51q28 -37 28 -117zM1216 1365v-499h-91v55q-53 -62 -103 -62q-46 0 -59 37q-8 24 -8 75v394h91v-367q0 -33 1 -35q3 -22 21 -22q27 0 57 43v381h91z" />
+    <glyph glyph-name="xing" unicode="&#xf168;" horiz-adv-x="1408" 
+d="M597 869q-10 -18 -257 -456q-27 -46 -65 -46h-239q-21 0 -31 17t0 36l253 448q1 0 0 1l-161 279q-12 22 -1 37q9 15 32 15h239q40 0 66 -45zM1403 1511q11 -16 0 -37l-528 -934v-1l336 -615q11 -20 1 -37q-10 -15 -32 -15h-239q-42 0 -66 45l-339 622q18 32 531 942
+q25 45 64 45h241q22 0 31 -15z" />
+    <glyph glyph-name="xing_sign" unicode="&#xf169;" 
+d="M685 771q0 1 -126 222q-21 34 -52 34h-184q-18 0 -26 -11q-7 -12 1 -29l125 -216v-1l-196 -346q-9 -14 0 -28q8 -13 24 -13h185q31 0 50 36zM1309 1268q-7 12 -24 12h-187q-30 0 -49 -35l-411 -729q1 -2 262 -481q20 -35 52 -35h184q18 0 25 12q8 13 -1 28l-260 476v1
+l409 723q8 16 0 28zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="youtube_play" unicode="&#xf16a;" horiz-adv-x="1792" 
+d="M711 408l484 250l-484 253v-503zM896 1270q168 0 324.5 -4.5t229.5 -9.5l73 -4q1 0 17 -1.5t23 -3t23.5 -4.5t28.5 -8t28 -13t31 -19.5t29 -26.5q6 -6 15.5 -18.5t29 -58.5t26.5 -101q8 -64 12.5 -136.5t5.5 -113.5v-40v-136q1 -145 -18 -290q-7 -55 -25 -99.5t-32 -61.5
+l-14 -17q-14 -15 -29 -26.5t-31 -19t-28 -12.5t-28.5 -8t-24 -4.5t-23 -3t-16.5 -1.5q-251 -19 -627 -19q-207 2 -359.5 6.5t-200.5 7.5l-49 4l-36 4q-36 5 -54.5 10t-51 21t-56.5 41q-6 6 -15.5 18.5t-29 58.5t-26.5 101q-8 64 -12.5 136.5t-5.5 113.5v40v136
+q-1 145 18 290q7 55 25 99.5t32 61.5l14 17q14 15 29 26.5t31 19.5t28 13t28.5 8t23.5 4.5t23 3t17 1.5q251 18 627 18z" />
+    <glyph glyph-name="dropbox" unicode="&#xf16b;" horiz-adv-x="1792" 
+d="M402 829l494 -305l-342 -285l-490 319zM1388 274v-108l-490 -293v-1l-1 1l-1 -1v1l-489 293v108l147 -96l342 284v2l1 -1l1 1v-2l343 -284zM554 1418l342 -285l-494 -304l-338 270zM1390 829l338 -271l-489 -319l-343 285zM1239 1418l489 -319l-338 -270l-494 304z" />
+    <glyph glyph-name="stackexchange" unicode="&#xf16c;" 
+d="M1289 -96h-1118v480h-160v-640h1438v640h-160v-480zM347 428l33 157l783 -165l-33 -156zM450 802l67 146l725 -339l-67 -145zM651 1158l102 123l614 -513l-102 -123zM1048 1536l477 -641l-128 -96l-477 641zM330 65v159h800v-159h-800z" />
+    <glyph glyph-name="instagram" unicode="&#xf16d;" 
+d="M1024 640q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM1162 640q0 -164 -115 -279t-279 -115t-279 115t-115 279t115 279t279 115t279 -115t115 -279zM1270 1050q0 -38 -27 -65t-65 -27t-65 27t-27 65t27 65t65 27t65 -27t27 -65zM768 1270
+q-7 0 -76.5 0.5t-105.5 0t-96.5 -3t-103 -10t-71.5 -18.5q-50 -20 -88 -58t-58 -88q-11 -29 -18.5 -71.5t-10 -103t-3 -96.5t0 -105.5t0.5 -76.5t-0.5 -76.5t0 -105.5t3 -96.5t10 -103t18.5 -71.5q20 -50 58 -88t88 -58q29 -11 71.5 -18.5t103 -10t96.5 -3t105.5 0t76.5 0.5
+t76.5 -0.5t105.5 0t96.5 3t103 10t71.5 18.5q50 20 88 58t58 88q11 29 18.5 71.5t10 103t3 96.5t0 105.5t-0.5 76.5t0.5 76.5t0 105.5t-3 96.5t-10 103t-18.5 71.5q-20 50 -58 88t-88 58q-29 11 -71.5 18.5t-103 10t-96.5 3t-105.5 0t-76.5 -0.5zM1536 640q0 -229 -5 -317
+q-10 -208 -124 -322t-322 -124q-88 -5 -317 -5t-317 5q-208 10 -322 124t-124 322q-5 88 -5 317t5 317q10 208 124 322t322 124q88 5 317 5t317 -5q208 -10 322 -124t124 -322q5 -88 5 -317z" />
+    <glyph glyph-name="flickr" unicode="&#xf16e;" 
+d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960zM698 640q0 88 -62 150t-150 62t-150 -62t-62 -150t62 -150t150 -62t150 62t62 150zM1262 640q0 88 -62 150
+t-150 62t-150 -62t-62 -150t62 -150t150 -62t150 62t62 150z" />
+    <glyph glyph-name="adn" unicode="&#xf170;" 
+d="M768 914l201 -306h-402zM1133 384h94l-459 691l-459 -691h94l104 160h522zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="f171" unicode="&#xf171;" horiz-adv-x="1408" 
+d="M815 677q8 -63 -50.5 -101t-111.5 -6q-39 17 -53.5 58t-0.5 82t52 58q36 18 72.5 12t64 -35.5t27.5 -67.5zM926 698q-14 107 -113 164t-197 13q-63 -28 -100.5 -88.5t-34.5 -129.5q4 -91 77.5 -155t165.5 -56q91 8 152 84t50 168zM1165 1240q-20 27 -56 44.5t-58 22
+t-71 12.5q-291 47 -566 -2q-43 -7 -66 -12t-55 -22t-50 -43q30 -28 76 -45.5t73.5 -22t87.5 -11.5q228 -29 448 -1q63 8 89.5 12t72.5 21.5t75 46.5zM1222 205q-8 -26 -15.5 -76.5t-14 -84t-28.5 -70t-58 -56.5q-86 -48 -189.5 -71.5t-202 -22t-201.5 18.5q-46 8 -81.5 18
+t-76.5 27t-73 43.5t-52 61.5q-25 96 -57 292l6 16l18 9q223 -148 506.5 -148t507.5 148q21 -6 24 -23t-5 -45t-8 -37zM1403 1166q-26 -167 -111 -655q-5 -30 -27 -56t-43.5 -40t-54.5 -31q-252 -126 -610 -88q-248 27 -394 139q-15 12 -25.5 26.5t-17 35t-9 34t-6 39.5
+t-5.5 35q-9 50 -26.5 150t-28 161.5t-23.5 147.5t-22 158q3 26 17.5 48.5t31.5 37.5t45 30t46 22.5t48 18.5q125 46 313 64q379 37 676 -50q155 -46 215 -122q16 -20 16.5 -51t-5.5 -54z" />
+    <glyph glyph-name="bitbucket_sign" unicode="&#xf172;" 
+d="M848 666q0 43 -41 66t-77 1q-43 -20 -42.5 -72.5t43.5 -70.5q39 -23 81 4t36 72zM928 682q8 -66 -36 -121t-110 -61t-119 40t-56 113q-2 49 25.5 93t72.5 64q70 31 141.5 -10t81.5 -118zM1100 1073q-20 -21 -53.5 -34t-53 -16t-63.5 -8q-155 -20 -324 0q-44 6 -63 9.5
+t-52.5 16t-54.5 32.5q13 19 36 31t40 15.5t47 8.5q198 35 408 1q33 -5 51 -8.5t43 -16t39 -31.5zM1142 327q0 7 5.5 26.5t3 32t-17.5 16.5q-161 -106 -365 -106t-366 106l-12 -6l-5 -12q26 -154 41 -210q47 -81 204 -108q249 -46 428 53q34 19 49 51.5t22.5 85.5t12.5 71z
+M1272 1020q9 53 -8 75q-43 55 -155 88q-216 63 -487 36q-132 -12 -226 -46q-38 -15 -59.5 -25t-47 -34t-29.5 -54q8 -68 19 -138t29 -171t24 -137q1 -5 5 -31t7 -36t12 -27t22 -28q105 -80 284 -100q259 -28 440 63q24 13 39.5 23t31 29t19.5 40q48 267 80 473zM1536 1120
+v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="tumblr" unicode="&#xf173;" horiz-adv-x="1024" 
+d="M944 207l80 -237q-23 -35 -111 -66t-177 -32q-104 -2 -190.5 26t-142.5 74t-95 106t-55.5 120t-16.5 118v544h-168v215q72 26 129 69.5t91 90t58 102t34 99t15 88.5q1 5 4.5 8.5t7.5 3.5h244v-424h333v-252h-334v-518q0 -30 6.5 -56t22.5 -52.5t49.5 -41.5t81.5 -14
+q78 2 134 29z" />
+    <glyph glyph-name="tumblr_sign" unicode="&#xf174;" 
+d="M1136 75l-62 183q-44 -22 -103 -22q-36 -1 -62 10.5t-38.5 31.5t-17.5 40.5t-5 43.5v398h257v194h-256v326h-188q-8 0 -9 -10q-5 -44 -17.5 -87t-39 -95t-77 -95t-118.5 -68v-165h130v-418q0 -57 21.5 -115t65 -111t121 -85.5t176.5 -30.5q69 1 136.5 25t85.5 50z
+M1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="long_arrow_down" unicode="&#xf175;" horiz-adv-x="768" 
+d="M765 237q8 -19 -5 -35l-350 -384q-10 -10 -23 -10q-14 0 -24 10l-355 384q-13 16 -5 35q9 19 29 19h224v1248q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-1248h224q21 0 29 -19z" />
+    <glyph glyph-name="long_arrow_up" unicode="&#xf176;" horiz-adv-x="768" 
+d="M765 1043q-9 -19 -29 -19h-224v-1248q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v1248h-224q-21 0 -29 19t5 35l350 384q10 10 23 10q14 0 24 -10l355 -384q13 -16 5 -35z" />
+    <glyph glyph-name="long_arrow_left" unicode="&#xf177;" horiz-adv-x="1792" 
+d="M1792 736v-192q0 -14 -9 -23t-23 -9h-1248v-224q0 -21 -19 -29t-35 5l-384 350q-10 10 -10 23q0 14 10 24l384 354q16 14 35 6q19 -9 19 -29v-224h1248q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="long_arrow_right" unicode="&#xf178;" horiz-adv-x="1792" 
+d="M1728 643q0 -14 -10 -24l-384 -354q-16 -14 -35 -6q-19 9 -19 29v224h-1248q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h1248v224q0 21 19 29t35 -5l384 -350q10 -10 10 -23z" />
+    <glyph glyph-name="apple" unicode="&#xf179;" horiz-adv-x="1408" 
+d="M1393 321q-39 -125 -123 -250q-129 -196 -257 -196q-49 0 -140 32q-86 32 -151 32q-61 0 -142 -33q-81 -34 -132 -34q-152 0 -301 259q-147 261 -147 503q0 228 113 374q113 144 284 144q72 0 177 -30q104 -30 138 -30q45 0 143 34q102 34 173 34q119 0 213 -65
+q52 -36 104 -100q-79 -67 -114 -118q-65 -94 -65 -207q0 -124 69 -223t158 -126zM1017 1494q0 -61 -29 -136q-30 -75 -93 -138q-54 -54 -108 -72q-37 -11 -104 -17q3 149 78 257q74 107 250 148q1 -3 2.5 -11t2.5 -11q0 -4 0.5 -10t0.5 -10z" />
+    <glyph glyph-name="windows" unicode="&#xf17a;" horiz-adv-x="1664" 
+d="M682 530v-651l-682 94v557h682zM682 1273v-659h-682v565zM1664 530v-786l-907 125v661h907zM1664 1408v-794h-907v669z" />
+    <glyph glyph-name="android" unicode="&#xf17b;" horiz-adv-x="1408" 
+d="M493 1053q16 0 27.5 11.5t11.5 27.5t-11.5 27.5t-27.5 11.5t-27 -11.5t-11 -27.5t11 -27.5t27 -11.5zM915 1053q16 0 27 11.5t11 27.5t-11 27.5t-27 11.5t-27.5 -11.5t-11.5 -27.5t11.5 -27.5t27.5 -11.5zM103 869q42 0 72 -30t30 -72v-430q0 -43 -29.5 -73t-72.5 -30
+t-73 30t-30 73v430q0 42 30 72t73 30zM1163 850v-666q0 -46 -32 -78t-77 -32h-75v-227q0 -43 -30 -73t-73 -30t-73 30t-30 73v227h-138v-227q0 -43 -30 -73t-73 -30q-42 0 -72 30t-30 73l-1 227h-74q-46 0 -78 32t-32 78v666h918zM931 1255q107 -55 171 -153.5t64 -215.5
+h-925q0 117 64 215.5t172 153.5l-71 131q-7 13 5 20q13 6 20 -6l72 -132q95 42 201 42t201 -42l72 132q7 12 20 6q12 -7 5 -20zM1408 767v-430q0 -43 -30 -73t-73 -30q-42 0 -72 30t-30 73v430q0 43 30 72.5t72 29.5q43 0 73 -29.5t30 -72.5z" />
+    <glyph glyph-name="linux" unicode="&#xf17c;" 
+d="M663 1125q-11 -1 -15.5 -10.5t-8.5 -9.5q-5 -1 -5 5q0 12 19 15h10zM750 1111q-4 -1 -11.5 6.5t-17.5 4.5q24 11 32 -2q3 -6 -3 -9zM399 684q-4 1 -6 -3t-4.5 -12.5t-5.5 -13.5t-10 -13q-10 -11 -1 -12q4 -1 12.5 7t12.5 18q1 3 2 7t2 6t1.5 4.5t0.5 4v3t-1 2.5t-3 2z
+M1254 325q0 18 -55 42q4 15 7.5 27.5t5 26t3 21.5t0.5 22.5t-1 19.5t-3.5 22t-4 20.5t-5 25t-5.5 26.5q-10 48 -47 103t-72 75q24 -20 57 -83q87 -162 54 -278q-11 -40 -50 -42q-31 -4 -38.5 18.5t-8 83.5t-11.5 107q-9 39 -19.5 69t-19.5 45.5t-15.5 24.5t-13 15t-7.5 7
+q-14 62 -31 103t-29.5 56t-23.5 33t-15 40q-4 21 6 53.5t4.5 49.5t-44.5 25q-15 3 -44.5 18t-35.5 16q-8 1 -11 26t8 51t36 27q37 3 51 -30t4 -58q-11 -19 -2 -26.5t30 -0.5q13 4 13 36v37q-5 30 -13.5 50t-21 30.5t-23.5 15t-27 7.5q-107 -8 -89 -134q0 -15 -1 -15
+q-9 9 -29.5 10.5t-33 -0.5t-15.5 5q1 57 -16 90t-45 34q-27 1 -41.5 -27.5t-16.5 -59.5q-1 -15 3.5 -37t13 -37.5t15.5 -13.5q10 3 16 14q4 9 -7 8q-7 0 -15.5 14.5t-9.5 33.5q-1 22 9 37t34 14q17 0 27 -21t9.5 -39t-1.5 -22q-22 -15 -31 -29q-8 -12 -27.5 -23.5
+t-20.5 -12.5q-13 -14 -15.5 -27t7.5 -18q14 -8 25 -19.5t16 -19t18.5 -13t35.5 -6.5q47 -2 102 15q2 1 23 7t34.5 10.5t29.5 13t21 17.5q9 14 20 8q5 -3 6.5 -8.5t-3 -12t-16.5 -9.5q-20 -6 -56.5 -21.5t-45.5 -19.5q-44 -19 -70 -23q-25 -5 -79 2q-10 2 -9 -2t17 -19
+q25 -23 67 -22q17 1 36 7t36 14t33.5 17.5t30 17t24.5 12t17.5 2.5t8.5 -11q0 -2 -1 -4.5t-4 -5t-6 -4.5t-8.5 -5t-9 -4.5t-10 -5t-9.5 -4.5q-28 -14 -67.5 -44t-66.5 -43t-49 -1q-21 11 -63 73q-22 31 -25 22q-1 -3 -1 -10q0 -25 -15 -56.5t-29.5 -55.5t-21 -58t11.5 -63
+q-23 -6 -62.5 -90t-47.5 -141q-2 -18 -1.5 -69t-5.5 -59q-8 -24 -29 -3q-32 31 -36 94q-2 28 4 56q4 19 -1 18q-2 -1 -4 -5q-36 -65 10 -166q5 -12 25 -28t24 -20q20 -23 104 -90.5t93 -76.5q16 -15 17.5 -38t-14 -43t-45.5 -23q8 -15 29 -44.5t28 -54t7 -70.5q46 24 7 92
+q-4 8 -10.5 16t-9.5 12t-2 6q3 5 13 9.5t20 -2.5q46 -52 166 -36q133 15 177 87q23 38 34 30q12 -6 10 -52q-1 -25 -23 -92q-9 -23 -6 -37.5t24 -15.5q3 19 14.5 77t13.5 90q2 21 -6.5 73.5t-7.5 97t23 70.5q15 18 51 18q1 37 34.5 53t72.5 10.5t60 -22.5zM626 1152
+q3 17 -2.5 30t-11.5 15q-9 2 -9 -7q2 -5 5 -6q10 0 7 -15q-3 -20 8 -20q3 0 3 3zM1045 955q-2 8 -6.5 11.5t-13 5t-14.5 5.5q-5 3 -9.5 8t-7 8t-5.5 6.5t-4 4t-4 -1.5q-14 -16 7 -43.5t39 -31.5q9 -1 14.5 8t3.5 20zM867 1168q0 11 -5 19.5t-11 12.5t-9 3q-6 0 -8 -2t0 -4
+t5 -3q14 -4 18 -31q0 -3 8 2q2 2 2 3zM921 1401q0 2 -2.5 5t-9 7t-9.5 6q-15 15 -24 15q-9 -1 -11.5 -7.5t-1 -13t-0.5 -12.5q-1 -4 -6 -10.5t-6 -9t3 -8.5q4 -3 8 0t11 9t15 9q1 1 9 1t15 2t9 7zM1486 60q20 -12 31 -24.5t12 -24t-2.5 -22.5t-15.5 -22t-23.5 -19.5
+t-30 -18.5t-31.5 -16.5t-32 -15.5t-27 -13q-38 -19 -85.5 -56t-75.5 -64q-17 -16 -68 -19.5t-89 14.5q-18 9 -29.5 23.5t-16.5 25.5t-22 19.5t-47 9.5q-44 1 -130 1q-19 0 -57 -1.5t-58 -2.5q-44 -1 -79.5 -15t-53.5 -30t-43.5 -28.5t-53.5 -11.5q-29 1 -111 31t-146 43
+q-19 4 -51 9.5t-50 9t-39.5 9.5t-33.5 14.5t-17 19.5q-10 23 7 66.5t18 54.5q1 16 -4 40t-10 42.5t-4.5 36.5t10.5 27q14 12 57 14t60 12q30 18 42 35t12 51q21 -73 -32 -106q-32 -20 -83 -15q-34 3 -43 -10q-13 -15 5 -57q2 -6 8 -18t8.5 -18t4.5 -17t1 -22q0 -15 -17 -49
+t-14 -48q3 -17 37 -26q20 -6 84.5 -18.5t99.5 -20.5q24 -6 74 -22t82.5 -23t55.5 -4q43 6 64.5 28t23 48t-7.5 58.5t-19 52t-20 36.5q-121 190 -169 242q-68 74 -113 40q-11 -9 -15 15q-3 16 -2 38q1 29 10 52t24 47t22 42q8 21 26.5 72t29.5 78t30 61t39 54
+q110 143 124 195q-12 112 -16 310q-2 90 24 151.5t106 104.5q39 21 104 21q53 1 106 -13.5t89 -41.5q57 -42 91.5 -121.5t29.5 -147.5q-5 -95 30 -214q34 -113 133 -218q55 -59 99.5 -163t59.5 -191q8 -49 5 -84.5t-12 -55.5t-20 -22q-10 -2 -23.5 -19t-27 -35.5
+t-40.5 -33.5t-61 -14q-18 1 -31.5 5t-22.5 13.5t-13.5 15.5t-11.5 20.5t-9 19.5q-22 37 -41 30t-28 -49t7 -97q20 -70 1 -195q-10 -65 18 -100.5t73 -33t85 35.5q59 49 89.5 66.5t103.5 42.5q53 18 77 36.5t18.5 34.5t-25 28.5t-51.5 23.5q-33 11 -49.5 48t-15 72.5
+t15.5 47.5q1 -31 8 -56.5t14.5 -40.5t20.5 -28.5t21 -19t21.5 -13t16.5 -9.5z" />
+    <glyph glyph-name="dribble" unicode="&#xf17d;" 
+d="M1024 36q-42 241 -140 498h-2l-2 -1q-16 -6 -43 -16.5t-101 -49t-137 -82t-131 -114.5t-103 -148l-15 11q184 -150 418 -150q132 0 256 52zM839 643q-21 49 -53 111q-311 -93 -673 -93q-1 -7 -1 -21q0 -124 44 -236.5t124 -201.5q50 89 123.5 166.5t142.5 124.5t130.5 81
+t99.5 48l37 13q4 1 13 3.5t13 4.5zM732 855q-120 213 -244 378q-138 -65 -234 -186t-128 -272q302 0 606 80zM1416 536q-210 60 -409 29q87 -239 128 -469q111 75 185 189.5t96 250.5zM611 1277q-1 0 -2 -1q1 1 2 1zM1201 1132q-185 164 -433 164q-76 0 -155 -19
+q131 -170 246 -382q69 26 130 60.5t96.5 61.5t65.5 57t37.5 40.5zM1424 647q-3 232 -149 410l-1 -1q-9 -12 -19 -24.5t-43.5 -44.5t-71 -60.5t-100 -65t-131.5 -64.5q25 -53 44 -95q2 -5 6.5 -17t7.5 -17q36 5 74.5 7t73.5 2t69 -1.5t64 -4t56.5 -5.5t48 -6.5t36.5 -6
+t25 -4.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="skype" unicode="&#xf17e;" 
+d="M1173 473q0 50 -19.5 91.5t-48.5 68.5t-73 49t-82.5 34t-87.5 23l-104 24q-30 7 -44 10.5t-35 11.5t-30 16t-16.5 21t-7.5 30q0 77 144 77q43 0 77 -12t54 -28.5t38 -33.5t40 -29t48 -12q47 0 75.5 32t28.5 77q0 55 -56 99.5t-142 67.5t-182 23q-68 0 -132 -15.5
+t-119.5 -47t-89 -87t-33.5 -128.5q0 -61 19 -106.5t56 -75.5t80 -48.5t103 -32.5l146 -36q90 -22 112 -36q32 -20 32 -60q0 -39 -40 -64.5t-105 -25.5q-51 0 -91.5 16t-65 38.5t-45.5 45t-46 38.5t-54 16q-50 0 -75.5 -30t-25.5 -75q0 -92 122 -157.5t291 -65.5
+q73 0 140 18.5t122.5 53.5t88.5 93.5t33 131.5zM1536 256q0 -159 -112.5 -271.5t-271.5 -112.5q-130 0 -234 80q-77 -16 -150 -16q-143 0 -273.5 55.5t-225 150t-150 225t-55.5 273.5q0 73 16 150q-80 104 -80 234q0 159 112.5 271.5t271.5 112.5q130 0 234 -80
+q77 16 150 16q143 0 273.5 -55.5t225 -150t150 -225t55.5 -273.5q0 -73 -16 -150q80 -104 80 -234z" />
+    <glyph glyph-name="foursquare" unicode="&#xf180;" horiz-adv-x="1280" 
+d="M1000 1102l37 194q5 23 -9 40t-35 17h-712q-23 0 -38.5 -17t-15.5 -37v-1101q0 -7 6 -1l291 352q23 26 38 33.5t48 7.5h239q22 0 37 14.5t18 29.5q24 130 37 191q4 21 -11.5 40t-36.5 19h-294q-29 0 -48 19t-19 48v42q0 29 19 47.5t48 18.5h346q18 0 35 13.5t20 29.5z
+M1227 1324q-15 -73 -53.5 -266.5t-69.5 -350t-35 -173.5q-6 -22 -9 -32.5t-14 -32.5t-24.5 -33t-38.5 -21t-58 -10h-271q-13 0 -22 -10q-8 -9 -426 -494q-22 -25 -58.5 -28.5t-48.5 5.5q-55 22 -55 98v1410q0 55 38 102.5t120 47.5h888q95 0 127 -53t10 -159zM1227 1324
+l-158 -790q4 17 35 173.5t69.5 350t53.5 266.5z" />
+    <glyph glyph-name="trello" unicode="&#xf181;" 
+d="M704 192v1024q0 14 -9 23t-23 9h-480q-14 0 -23 -9t-9 -23v-1024q0 -14 9 -23t23 -9h480q14 0 23 9t9 23zM1376 576v640q0 14 -9 23t-23 9h-480q-14 0 -23 -9t-9 -23v-640q0 -14 9 -23t23 -9h480q14 0 23 9t9 23zM1536 1344v-1408q0 -26 -19 -45t-45 -19h-1408
+q-26 0 -45 19t-19 45v1408q0 26 19 45t45 19h1408q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="female" unicode="&#xf182;" horiz-adv-x="1280" 
+d="M1280 480q0 -40 -28 -68t-68 -28q-51 0 -80 43l-227 341h-45v-132l247 -411q9 -15 9 -33q0 -26 -19 -45t-45 -19h-192v-272q0 -46 -33 -79t-79 -33h-160q-46 0 -79 33t-33 79v272h-192q-26 0 -45 19t-19 45q0 18 9 33l247 411v132h-45l-227 -341q-29 -43 -80 -43
+q-40 0 -68 28t-28 68q0 29 16 53l256 384q73 107 176 107h384q103 0 176 -107l256 -384q16 -24 16 -53zM864 1280q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5z" />
+    <glyph glyph-name="male" unicode="&#xf183;" horiz-adv-x="1024" 
+d="M1024 832v-416q0 -40 -28 -68t-68 -28t-68 28t-28 68v352h-64v-912q0 -46 -33 -79t-79 -33t-79 33t-33 79v464h-64v-464q0 -46 -33 -79t-79 -33t-79 33t-33 79v912h-64v-352q0 -40 -28 -68t-68 -28t-68 28t-28 68v416q0 80 56 136t136 56h640q80 0 136 -56t56 -136z
+M736 1280q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5z" />
+    <glyph glyph-name="gittip" unicode="&#xf184;" 
+d="M773 234l350 473q16 22 24.5 59t-6 85t-61.5 79q-40 26 -83 25.5t-73.5 -17.5t-54.5 -45q-36 -40 -96 -40q-59 0 -95 40q-24 28 -54.5 45t-73.5 17.5t-84 -25.5q-46 -31 -60.5 -79t-6 -85t24.5 -59zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103
+t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="sun" unicode="&#xf185;" horiz-adv-x="1792" 
+d="M1472 640q0 117 -45.5 223.5t-123 184t-184 123t-223.5 45.5t-223.5 -45.5t-184 -123t-123 -184t-45.5 -223.5t45.5 -223.5t123 -184t184 -123t223.5 -45.5t223.5 45.5t184 123t123 184t45.5 223.5zM1748 363q-4 -15 -20 -20l-292 -96v-306q0 -16 -13 -26q-15 -10 -29 -4
+l-292 94l-180 -248q-10 -13 -26 -13t-26 13l-180 248l-292 -94q-14 -6 -29 4q-13 10 -13 26v306l-292 96q-16 5 -20 20q-5 17 4 29l180 248l-180 248q-9 13 -4 29q4 15 20 20l292 96v306q0 16 13 26q15 10 29 4l292 -94l180 248q9 12 26 12t26 -12l180 -248l292 94
+q14 6 29 -4q13 -10 13 -26v-306l292 -96q16 -5 20 -20q5 -16 -4 -29l-180 -248l180 -248q9 -12 4 -29z" />
+    <glyph glyph-name="_366" unicode="&#xf186;" 
+d="M1262 233q-54 -9 -110 -9q-182 0 -337 90t-245 245t-90 337q0 192 104 357q-201 -60 -328.5 -229t-127.5 -384q0 -130 51 -248.5t136.5 -204t204 -136.5t248.5 -51q144 0 273.5 61.5t220.5 171.5zM1465 318q-94 -203 -283.5 -324.5t-413.5 -121.5q-156 0 -298 61
+t-245 164t-164 245t-61 298q0 153 57.5 292.5t156 241.5t235.5 164.5t290 68.5q44 2 61 -39q18 -41 -15 -72q-86 -78 -131.5 -181.5t-45.5 -218.5q0 -148 73 -273t198 -198t273 -73q118 0 228 51q41 18 72 -13q14 -14 17.5 -34t-4.5 -38z" />
+    <glyph glyph-name="archive" unicode="&#xf187;" horiz-adv-x="1792" 
+d="M1088 704q0 26 -19 45t-45 19h-256q-26 0 -45 -19t-19 -45t19 -45t45 -19h256q26 0 45 19t19 45zM1664 896v-960q0 -26 -19 -45t-45 -19h-1408q-26 0 -45 19t-19 45v960q0 26 19 45t45 19h1408q26 0 45 -19t19 -45zM1728 1344v-256q0 -26 -19 -45t-45 -19h-1536
+q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h1536q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="bug" unicode="&#xf188;" horiz-adv-x="1664" 
+d="M1632 576q0 -26 -19 -45t-45 -19h-224q0 -171 -67 -290l208 -209q19 -19 19 -45t-19 -45q-18 -19 -45 -19t-45 19l-198 197q-5 -5 -15 -13t-42 -28.5t-65 -36.5t-82 -29t-97 -13v896h-128v-896q-51 0 -101.5 13.5t-87 33t-66 39t-43.5 32.5l-15 14l-183 -207
+q-20 -21 -48 -21q-24 0 -43 16q-19 18 -20.5 44.5t15.5 46.5l202 227q-58 114 -58 274h-224q-26 0 -45 19t-19 45t19 45t45 19h224v294l-173 173q-19 19 -19 45t19 45t45 19t45 -19l173 -173h844l173 173q19 19 45 19t45 -19t19 -45t-19 -45l-173 -173v-294h224q26 0 45 -19
+t19 -45zM1152 1152h-640q0 133 93.5 226.5t226.5 93.5t226.5 -93.5t93.5 -226.5z" />
+    <glyph glyph-name="vk" unicode="&#xf189;" horiz-adv-x="1920" 
+d="M1917 1016q23 -64 -150 -294q-24 -32 -65 -85q-40 -51 -55 -72t-30.5 -49.5t-12 -42t13 -34.5t32.5 -43t57 -53q4 -2 5 -4q141 -131 191 -221q3 -5 6.5 -12.5t7 -26.5t-0.5 -34t-25 -27.5t-59 -12.5l-256 -4q-24 -5 -56 5t-52 22l-20 12q-30 21 -70 64t-68.5 77.5t-61 58
+t-56.5 15.5q-3 -1 -8 -3.5t-17 -14.5t-21.5 -29.5t-17 -52t-6.5 -77.5q0 -15 -3.5 -27.5t-7.5 -18.5l-4 -5q-18 -19 -53 -22h-115q-71 -4 -146 16.5t-131.5 53t-103 66t-70.5 57.5l-25 24q-10 10 -27.5 30t-71.5 91t-106 151t-122.5 211t-130.5 272q-6 16 -6 27t3 16l4 6
+q15 19 57 19l274 2q12 -2 23 -6.5t16 -8.5l5 -3q16 -11 24 -32q20 -50 46 -103.5t41 -81.5l16 -29q29 -60 56 -104t48.5 -68.5t41.5 -38.5t34 -14t27 5q2 1 5 5t12 22t13.5 47t9.5 81t0 125q-2 40 -9 73t-14 46l-6 12q-25 34 -85 43q-13 2 5 24q16 19 38 30q53 26 239 24
+q82 -1 135 -13q20 -5 33.5 -13.5t20.5 -24t10.5 -32t3.5 -45.5t-1 -55t-2.5 -70.5t-1.5 -82.5q0 -11 -1 -42t-0.5 -48t3.5 -40.5t11.5 -39t22.5 -24.5q8 -2 17 -4t26 11t38 34.5t52 67t68 107.5q60 104 107 225q4 10 10 17.5t11 10.5l4 3l5 2.5t13 3t20 0.5l288 2
+q39 5 64 -2.5t31 -16.5z" />
+    <glyph glyph-name="weibo" unicode="&#xf18a;" horiz-adv-x="1792" 
+d="M675 252q21 34 11 69t-45 50q-34 14 -73 1t-60 -46q-22 -34 -13 -68.5t43 -50.5t74.5 -2.5t62.5 47.5zM769 373q8 13 3.5 26.5t-17.5 18.5q-14 5 -28.5 -0.5t-21.5 -18.5q-17 -31 13 -45q14 -5 29 0.5t22 18.5zM943 266q-45 -102 -158 -150t-224 -12
+q-107 34 -147.5 126.5t6.5 187.5q47 93 151.5 139t210.5 19q111 -29 158.5 -119.5t2.5 -190.5zM1255 426q-9 96 -89 170t-208.5 109t-274.5 21q-223 -23 -369.5 -141.5t-132.5 -264.5q9 -96 89 -170t208.5 -109t274.5 -21q223 23 369.5 141.5t132.5 264.5zM1563 422
+q0 -68 -37 -139.5t-109 -137t-168.5 -117.5t-226 -83t-270.5 -31t-275 33.5t-240.5 93t-171.5 151t-65 199.5q0 115 69.5 245t197.5 258q169 169 341.5 236t246.5 -7q65 -64 20 -209q-4 -14 -1 -20t10 -7t14.5 0.5t13.5 3.5l6 2q139 59 246 59t153 -61q45 -63 0 -178
+q-2 -13 -4.5 -20t4.5 -12.5t12 -7.5t17 -6q57 -18 103 -47t80 -81.5t34 -116.5zM1489 1046q42 -47 54.5 -108.5t-6.5 -117.5q-8 -23 -29.5 -34t-44.5 -4q-23 8 -34 29.5t-4 44.5q20 63 -24 111t-107 35q-24 -5 -45 8t-25 37q-5 24 8 44.5t37 25.5q60 13 119 -5.5t101 -65.5z
+M1670 1209q87 -96 112.5 -222.5t-13.5 -241.5q-9 -27 -34 -40t-52 -4t-40 34t-5 52q28 82 10 172t-80 158q-62 69 -148 95.5t-173 8.5q-28 -6 -52 9.5t-30 43.5t9.5 51.5t43.5 29.5q123 26 244 -11.5t208 -134.5z" />
+    <glyph glyph-name="renren" unicode="&#xf18b;" 
+d="M1133 -34q-171 -94 -368 -94q-196 0 -367 94q138 87 235.5 211t131.5 268q35 -144 132.5 -268t235.5 -211zM638 1394v-485q0 -252 -126.5 -459.5t-330.5 -306.5q-181 215 -181 495q0 187 83.5 349.5t229.5 269.5t325 137zM1536 638q0 -280 -181 -495
+q-204 99 -330.5 306.5t-126.5 459.5v485q179 -30 325 -137t229.5 -269.5t83.5 -349.5z" />
+    <glyph glyph-name="_372" unicode="&#xf18c;" horiz-adv-x="1408" 
+d="M1402 433q-32 -80 -76 -138t-91 -88.5t-99 -46.5t-101.5 -14.5t-96.5 8.5t-86.5 22t-69.5 27.5t-46 22.5l-17 10q-113 -228 -289.5 -359.5t-384.5 -132.5q-19 0 -32 13t-13 32t13 31.5t32 12.5q173 1 322.5 107.5t251.5 294.5q-36 -14 -72 -23t-83 -13t-91 2.5t-93 28.5
+t-92 59t-84.5 100t-74.5 146q114 47 214 57t167.5 -7.5t124.5 -56.5t88.5 -77t56.5 -82q53 131 79 291q-7 -1 -18 -2.5t-46.5 -2.5t-69.5 0.5t-81.5 10t-88.5 23t-84 42.5t-75 65t-54.5 94.5t-28.5 127.5q70 28 133.5 36.5t112.5 -1t92 -30t73.5 -50t56 -61t42 -63t27.5 -56
+t16 -39.5l4 -16q12 122 12 195q-8 6 -21.5 16t-49 44.5t-63.5 71.5t-54 93t-33 112.5t12 127t70 138.5q73 -25 127.5 -61.5t84.5 -76.5t48 -85t20.5 -89t-0.5 -85.5t-13 -76.5t-19 -62t-17 -42l-7 -15q1 -4 1 -50t-1 -72q3 7 10 18.5t30.5 43t50.5 58t71 55.5t91.5 44.5
+t112 14.5t132.5 -24q-2 -78 -21.5 -141.5t-50 -104.5t-69.5 -71.5t-81.5 -45.5t-84.5 -24t-80 -9.5t-67.5 1t-46.5 4.5l-17 3q-23 -147 -73 -283q6 7 18 18.5t49.5 41t77.5 52.5t99.5 42t117.5 20t129 -23.5t137 -77.5z" />
+    <glyph glyph-name="stack_exchange" unicode="&#xf18d;" horiz-adv-x="1280" 
+d="M1259 283v-66q0 -85 -57.5 -144.5t-138.5 -59.5h-57l-260 -269v269h-529q-81 0 -138.5 59.5t-57.5 144.5v66h1238zM1259 609v-255h-1238v255h1238zM1259 937v-255h-1238v255h1238zM1259 1077v-67h-1238v67q0 84 57.5 143.5t138.5 59.5h846q81 0 138.5 -59.5t57.5 -143.5z
+" />
+    <glyph glyph-name="_374" unicode="&#xf18e;" 
+d="M1152 640q0 -14 -9 -23l-320 -320q-9 -9 -23 -9q-13 0 -22.5 9.5t-9.5 22.5v192h-352q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h352v192q0 14 9 23t23 9q12 0 24 -10l319 -319q9 -9 9 -23zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198
+t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="arrow_circle_alt_left" unicode="&#xf190;" 
+d="M1152 736v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-352v-192q0 -14 -9 -23t-23 -9q-12 0 -24 10l-319 319q-9 9 -9 23t9 23l320 320q9 9 23 9q13 0 22.5 -9.5t9.5 -22.5v-192h352q13 0 22.5 -9.5t9.5 -22.5zM1312 640q0 148 -73 273t-198 198t-273 73t-273 -73t-198 -198
+t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_376" unicode="&#xf191;" 
+d="M1024 960v-640q0 -26 -19 -45t-45 -19q-20 0 -37 12l-448 320q-27 19 -27 52t27 52l448 320q17 12 37 12q26 0 45 -19t19 -45zM1280 160v960q0 13 -9.5 22.5t-22.5 9.5h-960q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5z
+M1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="dot_circle_alt" unicode="&#xf192;" 
+d="M1024 640q0 -106 -75 -181t-181 -75t-181 75t-75 181t75 181t181 75t181 -75t75 -181zM768 1184q-148 0 -273 -73t-198 -198t-73 -273t73 -273t198 -198t273 -73t273 73t198 198t73 273t-73 273t-198 198t-273 73zM1536 640q0 -209 -103 -385.5t-279.5 -279.5
+t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_378" unicode="&#xf193;" horiz-adv-x="1664" 
+d="M1023 349l102 -204q-58 -179 -210 -290t-339 -111q-156 0 -288.5 77.5t-210 210t-77.5 288.5q0 181 104.5 330t274.5 211l17 -131q-122 -54 -195 -165.5t-73 -244.5q0 -185 131.5 -316.5t316.5 -131.5q126 0 232.5 65t165 175.5t49.5 236.5zM1571 249l58 -114l-256 -128
+q-13 -7 -29 -7q-40 0 -57 35l-239 477h-472q-24 0 -42.5 16.5t-21.5 40.5l-96 779q-2 17 6 42q14 51 57 82.5t97 31.5q66 0 113 -47t47 -113q0 -69 -52 -117.5t-120 -41.5l37 -289h423v-128h-407l16 -128h455q40 0 57 -35l228 -455z" />
+    <glyph glyph-name="vimeo_square" unicode="&#xf194;" 
+d="M1292 898q10 216 -161 222q-231 8 -312 -261q44 19 82 19q85 0 74 -96q-4 -57 -74 -167t-105 -110q-43 0 -82 169q-13 54 -45 255q-30 189 -160 177q-59 -7 -164 -100l-81 -72l-81 -72l52 -67q76 52 87 52q57 0 107 -179q15 -55 45 -164.5t45 -164.5q68 -179 164 -179
+q157 0 383 294q220 283 226 444zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_380" unicode="&#xf195;" horiz-adv-x="1152" 
+d="M1152 704q0 -191 -94.5 -353t-256.5 -256.5t-353 -94.5h-160q-14 0 -23 9t-9 23v611l-215 -66q-3 -1 -9 -1q-10 0 -19 6q-13 10 -13 26v128q0 23 23 31l233 71v93l-215 -66q-3 -1 -9 -1q-10 0 -19 6q-13 10 -13 26v128q0 23 23 31l233 71v250q0 14 9 23t23 9h160
+q14 0 23 -9t9 -23v-181l375 116q15 5 28 -5t13 -26v-128q0 -23 -23 -31l-393 -121v-93l375 116q15 5 28 -5t13 -26v-128q0 -23 -23 -31l-393 -121v-487q188 13 318 151t130 328q0 14 9 23t23 9h160q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="plus_square_o" unicode="&#xf196;" horiz-adv-x="1408" 
+d="M1152 736v-64q0 -14 -9 -23t-23 -9h-352v-352q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v352h-352q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h352v352q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-352h352q14 0 23 -9t9 -23zM1280 288v832q0 66 -47 113t-113 47h-832
+q-66 0 -113 -47t-47 -113v-832q0 -66 47 -113t113 -47h832q66 0 113 47t47 113zM1408 1120v-832q0 -119 -84.5 -203.5t-203.5 -84.5h-832q-119 0 -203.5 84.5t-84.5 203.5v832q0 119 84.5 203.5t203.5 84.5h832q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_382" unicode="&#xf197;" horiz-adv-x="2176" 
+d="M620 416q-110 -64 -268 -64h-128v64h-64q-13 0 -22.5 23.5t-9.5 56.5q0 24 7 49q-58 2 -96.5 10.5t-38.5 20.5t38.5 20.5t96.5 10.5q-7 25 -7 49q0 33 9.5 56.5t22.5 23.5h64v64h128q158 0 268 -64h1113q42 -7 106.5 -18t80.5 -14q89 -15 150 -40.5t83.5 -47.5t22.5 -40
+t-22.5 -40t-83.5 -47.5t-150 -40.5q-16 -3 -80.5 -14t-106.5 -18h-1113zM1739 668q53 -36 53 -92t-53 -92l81 -30q68 48 68 122t-68 122zM625 400h1015q-217 -38 -456 -80q-57 0 -113 -24t-83 -48l-28 -24l-288 -288q-26 -26 -70.5 -45t-89.5 -19h-96l-93 464h29
+q157 0 273 64zM352 816h-29l93 464h96q46 0 90 -19t70 -45l288 -288q4 -4 11 -10.5t30.5 -23t48.5 -29t61.5 -23t72.5 -10.5l456 -80h-1015q-116 64 -273 64z" />
+    <glyph glyph-name="_383" unicode="&#xf198;" horiz-adv-x="1664" 
+d="M1519 760q62 0 103.5 -40.5t41.5 -101.5q0 -97 -93 -130l-172 -59l56 -167q7 -21 7 -47q0 -59 -42 -102t-101 -43q-47 0 -85.5 27t-53.5 72l-55 165l-310 -106l55 -164q8 -24 8 -47q0 -59 -42 -102t-102 -43q-47 0 -85 27t-53 72l-55 163l-153 -53q-29 -9 -50 -9
+q-61 0 -101.5 40t-40.5 101q0 47 27.5 85t71.5 53l156 53l-105 313l-156 -54q-26 -8 -48 -8q-60 0 -101 40.5t-41 100.5q0 47 27.5 85t71.5 53l157 53l-53 159q-8 24 -8 47q0 60 42 102.5t102 42.5q47 0 85 -27t53 -72l54 -160l310 105l-54 160q-8 24 -8 47q0 59 42.5 102
+t101.5 43q47 0 85.5 -27.5t53.5 -71.5l53 -161l162 55q21 6 43 6q60 0 102.5 -39.5t42.5 -98.5q0 -45 -30 -81.5t-74 -51.5l-157 -54l105 -316l164 56q24 8 46 8zM725 498l310 105l-105 315l-310 -107z" />
+    <glyph glyph-name="_384" unicode="&#xf199;" 
+d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960zM1280 352v436q-31 -35 -64 -55q-34 -22 -132.5 -85t-151.5 -99q-98 -69 -164 -69v0v0q-66 0 -164 69
+q-47 32 -142 92.5t-142 92.5q-12 8 -33 27t-31 27v-436q0 -40 28 -68t68 -28h832q40 0 68 28t28 68zM1280 925q0 41 -27.5 70t-68.5 29h-832q-40 0 -68 -28t-28 -68q0 -37 30.5 -76.5t67.5 -64.5q47 -32 137.5 -89t129.5 -83q3 -2 17 -11.5t21 -14t21 -13t23.5 -13
+t21.5 -9.5t22.5 -7.5t20.5 -2.5t20.5 2.5t22.5 7.5t21.5 9.5t23.5 13t21 13t21 14t17 11.5l267 174q35 23 66.5 62.5t31.5 73.5z" />
+    <glyph glyph-name="_385" unicode="&#xf19a;" horiz-adv-x="1792" 
+d="M127 640q0 163 67 313l367 -1005q-196 95 -315 281t-119 411zM1415 679q0 -19 -2.5 -38.5t-10 -49.5t-11.5 -44t-17.5 -59t-17.5 -58l-76 -256l-278 826q46 3 88 8q19 2 26 18.5t-2.5 31t-28.5 13.5l-205 -10q-75 1 -202 10q-12 1 -20.5 -5t-11.5 -15t-1.5 -18.5t9 -16.5
+t19.5 -8l80 -8l120 -328l-168 -504l-280 832q46 3 88 8q19 2 26 18.5t-2.5 31t-28.5 13.5l-205 -10q-7 0 -23 0.5t-26 0.5q105 160 274.5 253.5t367.5 93.5q147 0 280.5 -53t238.5 -149h-10q-55 0 -92 -40.5t-37 -95.5q0 -12 2 -24t4 -21.5t8 -23t9 -21t12 -22.5t12.5 -21
+t14.5 -24t14 -23q63 -107 63 -212zM909 573l237 -647q1 -6 5 -11q-126 -44 -255 -44q-112 0 -217 32zM1570 1009q95 -174 95 -369q0 -209 -104 -385.5t-279 -278.5l235 678q59 169 59 276q0 42 -6 79zM896 1536q182 0 348 -71t286 -191t191 -286t71 -348t-71 -348t-191 -286
+t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71zM896 -215q173 0 331.5 68t273 182.5t182.5 273t68 331.5t-68 331.5t-182.5 273t-273 182.5t-331.5 68t-331.5 -68t-273 -182.5t-182.5 -273t-68 -331.5t68 -331.5t182.5 -273
+t273 -182.5t331.5 -68z" />
+    <glyph glyph-name="_386" unicode="&#xf19b;" horiz-adv-x="1792" 
+d="M1086 1536v-1536l-272 -128q-228 20 -414 102t-293 208.5t-107 272.5q0 140 100.5 263.5t275 205.5t391.5 108v-172q-217 -38 -356.5 -150t-139.5 -255q0 -152 154.5 -267t388.5 -145v1360zM1755 954l37 -390l-525 114l147 83q-119 70 -280 99v172q277 -33 481 -157z" />
+    <glyph glyph-name="_387" unicode="&#xf19c;" horiz-adv-x="2048" 
+d="M960 1536l960 -384v-128h-128q0 -26 -20.5 -45t-48.5 -19h-1526q-28 0 -48.5 19t-20.5 45h-128v128zM256 896h256v-768h128v768h256v-768h128v768h256v-768h128v768h256v-768h59q28 0 48.5 -19t20.5 -45v-64h-1664v64q0 26 20.5 45t48.5 19h59v768zM1851 -64
+q28 0 48.5 -19t20.5 -45v-128h-1920v128q0 26 20.5 45t48.5 19h1782z" />
+    <glyph glyph-name="_388" unicode="&#xf19d;" horiz-adv-x="2304" 
+d="M1774 700l18 -316q4 -69 -82 -128t-235 -93.5t-323 -34.5t-323 34.5t-235 93.5t-82 128l18 316l574 -181q22 -7 48 -7t48 7zM2304 1024q0 -23 -22 -31l-1120 -352q-4 -1 -10 -1t-10 1l-652 206q-43 -34 -71 -111.5t-34 -178.5q63 -36 63 -109q0 -69 -58 -107l58 -433
+q2 -14 -8 -25q-9 -11 -24 -11h-192q-15 0 -24 11q-10 11 -8 25l58 433q-58 38 -58 107q0 73 65 111q11 207 98 330l-333 104q-22 8 -22 31t22 31l1120 352q4 1 10 1t10 -1l1120 -352q22 -8 22 -31z" />
+    <glyph glyph-name="_389" unicode="&#xf19e;" 
+d="M859 579l13 -707q-62 11 -105 11q-41 0 -105 -11l13 707q-40 69 -168.5 295.5t-216.5 374.5t-181 287q58 -15 108 -15q44 0 111 15q63 -111 133.5 -229.5t167 -276.5t138.5 -227q37 61 109.5 177.5t117.5 190t105 176t107 189.5q54 -14 107 -14q56 0 114 14v0
+q-28 -39 -60 -88.5t-49.5 -78.5t-56.5 -96t-49 -84q-146 -248 -353 -610z" />
+    <glyph glyph-name="uniF1A0" unicode="&#xf1a0;" 
+d="M768 750h725q12 -67 12 -128q0 -217 -91 -387.5t-259.5 -266.5t-386.5 -96q-157 0 -299 60.5t-245 163.5t-163.5 245t-60.5 299t60.5 299t163.5 245t245 163.5t299 60.5q300 0 515 -201l-209 -201q-123 119 -306 119q-129 0 -238.5 -65t-173.5 -176.5t-64 -243.5
+t64 -243.5t173.5 -176.5t238.5 -65q87 0 160 24t120 60t82 82t51.5 87t22.5 78h-436v264z" />
+    <glyph glyph-name="f1a1" unicode="&#xf1a1;" horiz-adv-x="1792" 
+d="M1095 369q16 -16 0 -31q-62 -62 -199 -62t-199 62q-16 15 0 31q6 6 15 6t15 -6q48 -49 169 -49q120 0 169 49q6 6 15 6t15 -6zM788 550q0 -37 -26 -63t-63 -26t-63.5 26t-26.5 63q0 38 26.5 64t63.5 26t63 -26.5t26 -63.5zM1183 550q0 -37 -26.5 -63t-63.5 -26t-63 26
+t-26 63t26 63.5t63 26.5t63.5 -26t26.5 -64zM1434 670q0 49 -35 84t-85 35t-86 -36q-130 90 -311 96l63 283l200 -45q0 -37 26 -63t63 -26t63.5 26.5t26.5 63.5t-26.5 63.5t-63.5 26.5q-54 0 -80 -50l-221 49q-19 5 -25 -16l-69 -312q-180 -7 -309 -97q-35 37 -87 37
+q-50 0 -85 -35t-35 -84q0 -35 18.5 -64t49.5 -44q-6 -27 -6 -56q0 -142 140 -243t337 -101q198 0 338 101t140 243q0 32 -7 57q30 15 48 43.5t18 63.5zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191
+t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="_392" unicode="&#xf1a2;" 
+d="M939 407q13 -13 0 -26q-53 -53 -171 -53t-171 53q-13 13 0 26q5 6 13 6t13 -6q42 -42 145 -42t145 42q5 6 13 6t13 -6zM676 563q0 -31 -23 -54t-54 -23t-54 23t-23 54q0 32 22.5 54.5t54.5 22.5t54.5 -22.5t22.5 -54.5zM1014 563q0 -31 -23 -54t-54 -23t-54 23t-23 54
+q0 32 22.5 54.5t54.5 22.5t54.5 -22.5t22.5 -54.5zM1229 666q0 42 -30 72t-73 30q-42 0 -73 -31q-113 78 -267 82l54 243l171 -39q1 -32 23.5 -54t53.5 -22q32 0 54.5 22.5t22.5 54.5t-22.5 54.5t-54.5 22.5q-48 0 -69 -43l-189 42q-17 5 -21 -13l-60 -268q-154 -6 -265 -83
+q-30 32 -74 32q-43 0 -73 -30t-30 -72q0 -30 16 -55t42 -38q-5 -25 -5 -48q0 -122 120 -208.5t289 -86.5q170 0 290 86.5t120 208.5q0 25 -6 49q25 13 40.5 37.5t15.5 54.5zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960
+q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_393" unicode="&#xf1a3;" 
+d="M866 697l90 27v62q0 79 -58 135t-138 56t-138 -55.5t-58 -134.5v-283q0 -20 -14 -33.5t-33 -13.5t-32.5 13.5t-13.5 33.5v120h-151v-122q0 -82 57.5 -139t139.5 -57q81 0 138.5 56.5t57.5 136.5v280q0 19 13.5 33t33.5 14q19 0 32.5 -14t13.5 -33v-54zM1199 502v122h-150
+v-126q0 -20 -13.5 -33.5t-33.5 -13.5q-19 0 -32.5 14t-13.5 33v123l-90 -26l-60 28v-123q0 -80 58 -137t139 -57t138.5 57t57.5 139zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103
+t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="f1a4" unicode="&#xf1a4;" horiz-adv-x="1920" 
+d="M1062 824v118q0 42 -30 72t-72 30t-72 -30t-30 -72v-612q0 -175 -126 -299t-303 -124q-178 0 -303.5 125.5t-125.5 303.5v266h328v-262q0 -43 30 -72.5t72 -29.5t72 29.5t30 72.5v620q0 171 126.5 292t301.5 121q176 0 302 -122t126 -294v-136l-195 -58zM1592 602h328
+v-266q0 -178 -125.5 -303.5t-303.5 -125.5q-177 0 -303 124.5t-126 300.5v268l131 -61l195 58v-270q0 -42 30 -71.5t72 -29.5t72 29.5t30 71.5v275z" />
+    <glyph glyph-name="_395" unicode="&#xf1a5;" 
+d="M1472 160v480h-704v704h-480q-93 0 -158.5 -65.5t-65.5 -158.5v-480h704v-704h480q93 0 158.5 65.5t65.5 158.5zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5
+t84.5 -203.5z" />
+    <glyph glyph-name="_396" unicode="&#xf1a6;" horiz-adv-x="2048" 
+d="M328 1254h204v-983h-532v697h328v286zM328 435v369h-123v-369h123zM614 968v-697h205v697h-205zM614 1254v-204h205v204h-205zM901 968h533v-942h-533v163h328v82h-328v697zM1229 435v369h-123v-369h123zM1516 968h532v-942h-532v163h327v82h-327v697zM1843 435v369h-123
+v-369h123z" />
+    <glyph glyph-name="_397" unicode="&#xf1a7;" 
+d="M1046 516q0 -64 -38 -109t-91 -45q-43 0 -70 15v277q28 17 70 17q53 0 91 -45.5t38 -109.5zM703 944q0 -64 -38 -109.5t-91 -45.5q-43 0 -70 15v277q28 17 70 17q53 0 91 -45t38 -109zM1265 513q0 134 -88 229t-213 95q-20 0 -39 -3q-23 -78 -78 -136q-87 -95 -211 -101
+v-636l211 41v206q51 -19 117 -19q125 0 213 95t88 229zM922 940q0 134 -88.5 229t-213.5 95q-74 0 -141 -36h-186v-840l211 41v206q55 -19 116 -19q125 0 213.5 95t88.5 229zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960
+q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_398" unicode="&#xf1a8;" horiz-adv-x="2038" 
+d="M1222 607q75 3 143.5 -20.5t118 -58.5t101 -94.5t84 -108t75.5 -120.5q33 -56 78.5 -109t75.5 -80.5t99 -88.5q-48 -30 -108.5 -57.5t-138.5 -59t-114 -47.5q-44 37 -74 115t-43.5 164.5t-33 180.5t-42.5 168.5t-72.5 123t-122.5 48.5l-10 -2l-6 -4q4 -5 13 -14
+q6 -5 28 -23.5t25.5 -22t19 -18t18 -20.5t11.5 -21t10.5 -27.5t4.5 -31t4 -40.5l1 -33q1 -26 -2.5 -57.5t-7.5 -52t-12.5 -58.5t-11.5 -53q-35 1 -101 -9.5t-98 -10.5q-39 0 -72 10q-2 16 -2 47q0 74 3 96q2 13 31.5 41.5t57 59t26.5 51.5q-24 2 -43 -24
+q-36 -53 -111.5 -99.5t-136.5 -46.5q-25 0 -75.5 63t-106.5 139.5t-84 96.5q-6 4 -27 30q-482 -112 -513 -112q-16 0 -28 11t-12 27q0 15 8.5 26.5t22.5 14.5l486 106q-8 14 -8 25t5.5 17.5t16 11.5t20 7t23 4.5t18.5 4.5q4 1 15.5 7.5t17.5 6.5q15 0 28 -16t20 -33
+q163 37 172 37q17 0 29.5 -11t12.5 -28q0 -15 -8.5 -26t-23.5 -14l-182 -40l-1 -16q-1 -26 81.5 -117.5t104.5 -91.5q47 0 119 80t72 129q0 36 -23.5 53t-51 18.5t-51 11.5t-23.5 34q0 16 10 34l-68 19q43 44 43 117q0 26 -5 58q82 16 144 16q44 0 71.5 -1.5t48.5 -8.5
+t31 -13.5t20.5 -24.5t15.5 -33.5t17 -47.5t24 -60l50 25q-3 -40 -23 -60t-42.5 -21t-40 -6.5t-16.5 -20.5zM1282 842q-5 5 -13.5 15.5t-12 14.5t-10.5 11.5t-10 10.5l-8 8t-8.5 7.5t-8 5t-8.5 4.5q-7 3 -14.5 5t-20.5 2.5t-22 0.5h-32.5h-37.5q-126 0 -217 -43
+q16 30 36 46.5t54 29.5t65.5 36t46 36.5t50 55t43.5 50.5q12 -9 28 -31.5t32 -36.5t38 -13l12 1v-76l22 -1q247 95 371 190q28 21 50 39t42.5 37.5t33 31t29.5 34t24 31t24.5 37t23 38t27 47.5t29.5 53l7 9q-2 -53 -43 -139q-79 -165 -205 -264t-306 -142q-14 -3 -42 -7.5
+t-50 -9.5t-39 -14q3 -19 24.5 -46t21.5 -34q0 -11 -26 -30zM1061 -79q39 26 131.5 47.5t146.5 21.5q9 0 22.5 -15.5t28 -42.5t26 -50t24 -51t14.5 -33q-121 -45 -244 -45q-61 0 -125 11zM822 568l48 12l109 -177l-73 -48zM1323 51q3 -15 3 -16q0 -7 -17.5 -14.5t-46 -13
+t-54 -9.5t-53.5 -7.5t-32 -4.5l-7 43q21 2 60.5 8.5t72 10t60.5 3.5h14zM866 679l-96 -20l-6 17q10 1 32.5 7t34.5 6q19 0 35 -10zM1061 45h31l10 -83l-41 -12v95zM1950 1535v1v-1zM1950 1535l-1 -5l-2 -2l1 3zM1950 1535l1 1z" />
+    <glyph glyph-name="_399" unicode="&#xf1a9;" 
+d="M1167 -50q-5 19 -24 5q-30 -22 -87 -39t-131 -17q-129 0 -193 49q-5 4 -13 4q-11 0 -26 -12q-7 -6 -7.5 -16t7.5 -20q34 -32 87.5 -46t102.5 -12.5t99 4.5q41 4 84.5 20.5t65 30t28.5 20.5q12 12 7 29zM1128 65q-19 47 -39 61q-23 15 -76 15q-47 0 -71 -10
+q-29 -12 -78 -56q-26 -24 -12 -44q9 -8 17.5 -4.5t31.5 23.5q3 2 10.5 8.5t10.5 8.5t10 7t11.5 7t12.5 5t15 4.5t16.5 2.5t20.5 1q27 0 44.5 -7.5t23 -14.5t13.5 -22q10 -17 12.5 -20t12.5 1q23 12 14 34zM1483 346q0 22 -5 44.5t-16.5 45t-34 36.5t-52.5 14
+q-33 0 -97 -41.5t-129 -83.5t-101 -42q-27 -1 -63.5 19t-76 49t-83.5 58t-100 49t-111 19q-115 -1 -197 -78.5t-84 -178.5q-2 -112 74 -164q29 -20 62.5 -28.5t103.5 -8.5q57 0 132 32.5t134 71t120 70.5t93 31q26 -1 65 -31.5t71.5 -67t68 -67.5t55.5 -32q35 -3 58.5 14
+t55.5 63q28 41 42.5 101t14.5 106zM1536 506q0 -164 -62 -304.5t-166 -236t-242.5 -149.5t-290.5 -54t-293 57.5t-247.5 157t-170.5 241.5t-64 302q0 89 19.5 172.5t49 145.5t70.5 118.5t78.5 94t78.5 69.5t64.5 46.5t42.5 24.5q14 8 51 26.5t54.5 28.5t48 30t60.5 44
+q36 28 58 72.5t30 125.5q129 -155 186 -193q44 -29 130 -68t129 -66q21 -13 39 -25t60.5 -46.5t76 -70.5t75 -95t69 -122t47 -148.5t19.5 -177.5z" />
+    <glyph glyph-name="_400" unicode="&#xf1aa;" 
+d="M1070 463l-160 -160l-151 -152l-30 -30q-65 -64 -151.5 -87t-171.5 -2q-16 -70 -72 -115t-129 -45q-85 0 -145 60.5t-60 145.5q0 72 44.5 128t113.5 72q-22 86 1 173t88 152l12 12l151 -152l-11 -11q-37 -37 -37 -89t37 -90q37 -37 89 -37t89 37l30 30l151 152l161 160z
+M729 1145l12 -12l-152 -152l-12 12q-37 37 -89 37t-89 -37t-37 -89.5t37 -89.5l29 -29l152 -152l160 -160l-151 -152l-161 160l-151 152l-30 30q-68 67 -90 159.5t5 179.5q-70 15 -115 71t-45 129q0 85 60 145.5t145 60.5q76 0 133.5 -49t69.5 -123q84 20 169.5 -3.5
+t149.5 -87.5zM1536 78q0 -85 -60 -145.5t-145 -60.5q-74 0 -131 47t-71 118q-86 -28 -179.5 -6t-161.5 90l-11 12l151 152l12 -12q37 -37 89 -37t89 37t37 89t-37 89l-30 30l-152 152l-160 160l152 152l160 -160l152 -152l29 -30q64 -64 87.5 -150.5t2.5 -171.5
+q76 -11 126.5 -68.5t50.5 -134.5zM1534 1202q0 -77 -51 -135t-127 -69q26 -85 3 -176.5t-90 -158.5l-12 -12l-151 152l12 12q37 37 37 89t-37 89t-89 37t-89 -37l-30 -30l-152 -152l-160 -160l-152 152l161 160l152 152l29 30q67 67 159 89.5t178 -3.5q11 75 68.5 126
+t135.5 51q85 0 145 -60.5t60 -145.5z" />
+    <glyph glyph-name="f1ab" unicode="&#xf1ab;" 
+d="M654 458q-1 -3 -12.5 0.5t-31.5 11.5l-20 9q-44 20 -87 49q-7 5 -41 31.5t-38 28.5q-67 -103 -134 -181q-81 -95 -105 -110q-4 -2 -19.5 -4t-18.5 0q6 4 82 92q21 24 85.5 115t78.5 118q17 30 51 98.5t36 77.5q-8 1 -110 -33q-8 -2 -27.5 -7.5t-34.5 -9.5t-17 -5
+q-2 -2 -2 -10.5t-1 -9.5q-5 -10 -31 -15q-23 -7 -47 0q-18 4 -28 21q-4 6 -5 23q6 2 24.5 5t29.5 6q58 16 105 32q100 35 102 35q10 2 43 19.5t44 21.5q9 3 21.5 8t14.5 5.5t6 -0.5q2 -12 -1 -33q0 -2 -12.5 -27t-26.5 -53.5t-17 -33.5q-25 -50 -77 -131l64 -28
+q12 -6 74.5 -32t67.5 -28q4 -1 10.5 -25.5t4.5 -30.5zM449 944q3 -15 -4 -28q-12 -23 -50 -38q-30 -12 -60 -12q-26 3 -49 26q-14 15 -18 41l1 3q3 -3 19.5 -5t26.5 0t58 16q36 12 55 14q17 0 21 -17zM1147 815l63 -227l-139 42zM39 15l694 232v1032l-694 -233v-1031z
+M1280 332l102 -31l-181 657l-100 31l-216 -536l102 -31l45 110l211 -65zM777 1294l573 -184v380zM1088 -29l158 -13l-54 -160l-40 66q-130 -83 -276 -108q-58 -12 -91 -12h-84q-79 0 -199.5 39t-183.5 85q-8 7 -8 16q0 8 5 13.5t13 5.5q4 0 18 -7.5t30.5 -16.5t20.5 -11
+q73 -37 159.5 -61.5t157.5 -24.5q95 0 167 14.5t157 50.5q15 7 30.5 15.5t34 19t28.5 16.5zM1536 1050v-1079l-774 246q-14 -6 -375 -127.5t-368 -121.5q-13 0 -18 13q0 1 -1 3v1078q3 9 4 10q5 6 20 11q107 36 149 50v384l558 -198q2 0 160.5 55t316 108.5t161.5 53.5
+q20 0 20 -21v-418z" />
+    <glyph glyph-name="_402" unicode="&#xf1ac;" horiz-adv-x="1792" 
+d="M288 1152q66 0 113 -47t47 -113v-1088q0 -66 -47 -113t-113 -47h-128q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h128zM1664 989q58 -34 93 -93t35 -128v-768q0 -106 -75 -181t-181 -75h-864q-66 0 -113 47t-47 113v1536q0 40 28 68t68 28h672q40 0 88 -20t76 -48
+l152 -152q28 -28 48 -76t20 -88v-163zM928 0v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM928 256v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM928 512v128q0 14 -9 23
+t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM1184 0v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM1184 256v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128
+q14 0 23 9t9 23zM1184 512v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM1440 0v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM1440 256v128q0 14 -9 23t-23 9h-128
+q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM1440 512v128q0 14 -9 23t-23 9h-128q-14 0 -23 -9t-9 -23v-128q0 -14 9 -23t23 -9h128q14 0 23 9t9 23zM1536 896v256h-160q-40 0 -68 28t-28 68v160h-640v-512h896z" />
+    <glyph glyph-name="_403" unicode="&#xf1ad;" 
+d="M1344 1536q26 0 45 -19t19 -45v-1664q0 -26 -19 -45t-45 -19h-1280q-26 0 -45 19t-19 45v1664q0 26 19 45t45 19h1280zM512 1248v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23zM512 992v-64q0 -14 9 -23t23 -9h64q14 0 23 9
+t9 23v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23zM512 736v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23zM512 480v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23zM384 160v64
+q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM384 416v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM384 672v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64
+q14 0 23 9t9 23zM384 928v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM384 1184v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM896 -96v192q0 14 -9 23t-23 9h-320q-14 0 -23 -9
+t-9 -23v-192q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM896 416v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM896 672v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM896 928v64
+q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM896 1184v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1152 160v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64
+q14 0 23 9t9 23zM1152 416v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1152 672v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1152 928v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9
+t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1152 1184v64q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h64q14 0 23 9t9 23z" />
+    <glyph glyph-name="_404" unicode="&#xf1ae;" horiz-adv-x="1280" 
+d="M1188 988l-292 -292v-824q0 -46 -33 -79t-79 -33t-79 33t-33 79v384h-64v-384q0 -46 -33 -79t-79 -33t-79 33t-33 79v824l-292 292q-28 28 -28 68t28 68q29 28 68.5 28t67.5 -28l228 -228h368l228 228q28 28 68 28t68 -28q28 -29 28 -68.5t-28 -67.5zM864 1152
+q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5z" />
+    <glyph glyph-name="uniF1B1" unicode="&#xf1b0;" horiz-adv-x="1664" 
+d="M780 1064q0 -60 -19 -113.5t-63 -92.5t-105 -39q-76 0 -138 57.5t-92 135.5t-30 151q0 60 19 113.5t63 92.5t105 39q77 0 138.5 -57.5t91.5 -135t30 -151.5zM438 581q0 -80 -42 -139t-119 -59q-76 0 -141.5 55.5t-100.5 133.5t-35 152q0 80 42 139.5t119 59.5
+q76 0 141.5 -55.5t100.5 -134t35 -152.5zM832 608q118 0 255 -97.5t229 -237t92 -254.5q0 -46 -17 -76.5t-48.5 -45t-64.5 -20t-76 -5.5q-68 0 -187.5 45t-182.5 45q-66 0 -192.5 -44.5t-200.5 -44.5q-183 0 -183 146q0 86 56 191.5t139.5 192.5t187.5 146t193 59zM1071 819
+q-61 0 -105 39t-63 92.5t-19 113.5q0 74 30 151.5t91.5 135t138.5 57.5q61 0 105 -39t63 -92.5t19 -113.5q0 -73 -30 -151t-92 -135.5t-138 -57.5zM1503 923q77 0 119 -59.5t42 -139.5q0 -74 -35 -152t-100.5 -133.5t-141.5 -55.5q-77 0 -119 59t-42 139q0 74 35 152.5
+t100.5 134t141.5 55.5z" />
+    <glyph glyph-name="_406" unicode="&#xf1b1;" horiz-adv-x="768" 
+d="M704 1008q0 -145 -57 -243.5t-152 -135.5l45 -821q2 -26 -16 -45t-44 -19h-192q-26 0 -44 19t-16 45l45 821q-95 37 -152 135.5t-57 243.5q0 128 42.5 249.5t117.5 200t160 78.5t160 -78.5t117.5 -200t42.5 -249.5z" />
+    <glyph glyph-name="_407" unicode="&#xf1b2;" horiz-adv-x="1792" 
+d="M896 -93l640 349v636l-640 -233v-752zM832 772l698 254l-698 254l-698 -254zM1664 1024v-768q0 -35 -18 -65t-49 -47l-704 -384q-28 -16 -61 -16t-61 16l-704 384q-31 17 -49 47t-18 65v768q0 40 23 73t61 47l704 256q22 8 44 8t44 -8l704 -256q38 -14 61 -47t23 -73z
+" />
+    <glyph glyph-name="_408" unicode="&#xf1b3;" horiz-adv-x="2304" 
+d="M640 -96l384 192v314l-384 -164v-342zM576 358l404 173l-404 173l-404 -173zM1664 -96l384 192v314l-384 -164v-342zM1600 358l404 173l-404 173l-404 -173zM1152 651l384 165v266l-384 -164v-267zM1088 1030l441 189l-441 189l-441 -189zM2176 512v-416q0 -36 -19 -67
+t-52 -47l-448 -224q-25 -14 -57 -14t-57 14l-448 224q-4 2 -7 4q-2 -2 -7 -4l-448 -224q-25 -14 -57 -14t-57 14l-448 224q-33 16 -52 47t-19 67v416q0 38 21.5 70t56.5 48l434 186v400q0 38 21.5 70t56.5 48l448 192q23 10 50 10t50 -10l448 -192q35 -16 56.5 -48t21.5 -70
+v-400l434 -186q36 -16 57 -48t21 -70z" />
+    <glyph glyph-name="_409" unicode="&#xf1b4;" horiz-adv-x="2048" 
+d="M1848 1197h-511v-124h511v124zM1596 771q-90 0 -146 -52.5t-62 -142.5h408q-18 195 -200 195zM1612 186q63 0 122 32t76 87h221q-100 -307 -427 -307q-214 0 -340.5 132t-126.5 347q0 208 130.5 345.5t336.5 137.5q138 0 240.5 -68t153 -179t50.5 -248q0 -17 -2 -47h-658
+q0 -111 57.5 -171.5t166.5 -60.5zM277 236h296q205 0 205 167q0 180 -199 180h-302v-347zM277 773h281q78 0 123.5 36.5t45.5 113.5q0 144 -190 144h-260v-294zM0 1282h594q87 0 155 -14t126.5 -47.5t90 -96.5t31.5 -154q0 -181 -172 -263q114 -32 172 -115t58 -204
+q0 -75 -24.5 -136.5t-66 -103.5t-98.5 -71t-121 -42t-134 -13h-611v1260z" />
+    <glyph glyph-name="_410" unicode="&#xf1b5;" 
+d="M1248 1408q119 0 203.5 -84.5t84.5 -203.5v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960zM499 1041h-371v-787h382q117 0 197 57.5t80 170.5q0 158 -143 200q107 52 107 164q0 57 -19.5 96.5
+t-56.5 60.5t-79 29.5t-97 8.5zM477 723h-176v184h163q119 0 119 -90q0 -94 -106 -94zM486 388h-185v217h189q124 0 124 -113q0 -104 -128 -104zM1136 356q-68 0 -104 38t-36 107h411q1 10 1 30q0 132 -74.5 220.5t-203.5 88.5q-128 0 -210 -86t-82 -216q0 -135 79 -217
+t213 -82q205 0 267 191h-138q-11 -34 -47.5 -54t-75.5 -20zM1126 722q113 0 124 -122h-254q4 56 39 89t91 33zM964 988h319v-77h-319v77z" />
+    <glyph glyph-name="_411" unicode="&#xf1b6;" horiz-adv-x="1792" 
+d="M1582 954q0 -101 -71.5 -172.5t-172.5 -71.5t-172.5 71.5t-71.5 172.5t71.5 172.5t172.5 71.5t172.5 -71.5t71.5 -172.5zM812 212q0 104 -73 177t-177 73q-27 0 -54 -6l104 -42q77 -31 109.5 -106.5t1.5 -151.5q-31 -77 -107 -109t-152 -1q-21 8 -62 24.5t-61 24.5
+q32 -60 91 -96.5t130 -36.5q104 0 177 73t73 177zM1642 953q0 126 -89.5 215.5t-215.5 89.5q-127 0 -216.5 -89.5t-89.5 -215.5q0 -127 89.5 -216t216.5 -89q126 0 215.5 89t89.5 216zM1792 953q0 -189 -133.5 -322t-321.5 -133l-437 -319q-12 -129 -109 -218t-229 -89
+q-121 0 -214 76t-118 192l-230 92v429l389 -157q79 48 173 48q13 0 35 -2l284 407q2 187 135.5 319t320.5 132q188 0 321.5 -133.5t133.5 -321.5z" />
+    <glyph glyph-name="_412" unicode="&#xf1b7;" 
+d="M1242 889q0 80 -57 136.5t-137 56.5t-136.5 -57t-56.5 -136q0 -80 56.5 -136.5t136.5 -56.5t137 56.5t57 136.5zM632 301q0 -83 -58 -140.5t-140 -57.5q-56 0 -103 29t-72 77q52 -20 98 -40q60 -24 120 1.5t85 86.5q24 60 -1.5 120t-86.5 84l-82 33q22 5 42 5
+q82 0 140 -57.5t58 -140.5zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v153l172 -69q20 -92 93.5 -152t168.5 -60q104 0 181 70t87 173l345 252q150 0 255.5 105.5t105.5 254.5q0 150 -105.5 255.5t-255.5 105.5
+q-148 0 -253 -104.5t-107 -252.5l-225 -322q-9 1 -28 1q-75 0 -137 -37l-297 119v468q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5zM1289 887q0 -100 -71 -170.5t-171 -70.5t-170.5 70.5t-70.5 170.5t70.5 171t170.5 71q101 0 171.5 -70.5t70.5 -171.5z
+" />
+    <glyph glyph-name="_413" unicode="&#xf1b8;" horiz-adv-x="1792" 
+d="M836 367l-15 -368l-2 -22l-420 29q-36 3 -67 31.5t-47 65.5q-11 27 -14.5 55t4 65t12 55t21.5 64t19 53q78 -12 509 -28zM449 953l180 -379l-147 92q-63 -72 -111.5 -144.5t-72.5 -125t-39.5 -94.5t-18.5 -63l-4 -21l-190 357q-17 26 -18 56t6 47l8 18q35 63 114 188
+l-140 86zM1680 436l-188 -359q-12 -29 -36.5 -46.5t-43.5 -20.5l-18 -4q-71 -7 -219 -12l8 -164l-230 367l211 362l7 -173q170 -16 283 -5t170 33zM895 1360q-47 -63 -265 -435l-317 187l-19 12l225 356q20 31 60 45t80 10q24 -2 48.5 -12t42 -21t41.5 -33t36 -34.5
+t36 -39.5t32 -35zM1550 1053l212 -363q18 -37 12.5 -76t-27.5 -74q-13 -20 -33 -37t-38 -28t-48.5 -22t-47 -16t-51.5 -14t-46 -12q-34 72 -265 436l313 195zM1407 1279l142 83l-220 -373l-419 20l151 86q-34 89 -75 166t-75.5 123.5t-64.5 80t-47 46.5l-17 13l405 -1
+q31 3 58 -10.5t39 -28.5l11 -15q39 -61 112 -190z" />
+    <glyph glyph-name="_414" unicode="&#xf1b9;" horiz-adv-x="2048" 
+d="M480 448q0 66 -47 113t-113 47t-113 -47t-47 -113t47 -113t113 -47t113 47t47 113zM516 768h1016l-89 357q-2 8 -14 17.5t-21 9.5h-768q-9 0 -21 -9.5t-14 -17.5zM1888 448q0 66 -47 113t-113 47t-113 -47t-47 -113t47 -113t113 -47t113 47t47 113zM2048 544v-384
+q0 -14 -9 -23t-23 -9h-96v-128q0 -80 -56 -136t-136 -56t-136 56t-56 136v128h-1024v-128q0 -80 -56 -136t-136 -56t-136 56t-56 136v128h-96q-14 0 -23 9t-9 23v384q0 93 65.5 158.5t158.5 65.5h28l105 419q23 94 104 157.5t179 63.5h768q98 0 179 -63.5t104 -157.5
+l105 -419h28q93 0 158.5 -65.5t65.5 -158.5z" />
+    <glyph glyph-name="_415" unicode="&#xf1ba;" horiz-adv-x="2048" 
+d="M1824 640q93 0 158.5 -65.5t65.5 -158.5v-384q0 -14 -9 -23t-23 -9h-96v-64q0 -80 -56 -136t-136 -56t-136 56t-56 136v64h-1024v-64q0 -80 -56 -136t-136 -56t-136 56t-56 136v64h-96q-14 0 -23 9t-9 23v384q0 93 65.5 158.5t158.5 65.5h28l105 419q23 94 104 157.5
+t179 63.5h128v224q0 14 9 23t23 9h448q14 0 23 -9t9 -23v-224h128q98 0 179 -63.5t104 -157.5l105 -419h28zM320 160q66 0 113 47t47 113t-47 113t-113 47t-113 -47t-47 -113t47 -113t113 -47zM516 640h1016l-89 357q-2 8 -14 17.5t-21 9.5h-768q-9 0 -21 -9.5t-14 -17.5z
+M1728 160q66 0 113 47t47 113t-47 113t-113 47t-113 -47t-47 -113t47 -113t113 -47z" />
+    <glyph glyph-name="_416" unicode="&#xf1bb;" 
+d="M1504 64q0 -26 -19 -45t-45 -19h-462q1 -17 6 -87.5t5 -108.5q0 -25 -18 -42.5t-43 -17.5h-320q-25 0 -43 17.5t-18 42.5q0 38 5 108.5t6 87.5h-462q-26 0 -45 19t-19 45t19 45l402 403h-229q-26 0 -45 19t-19 45t19 45l402 403h-197q-26 0 -45 19t-19 45t19 45l384 384
+q19 19 45 19t45 -19l384 -384q19 -19 19 -45t-19 -45t-45 -19h-197l402 -403q19 -19 19 -45t-19 -45t-45 -19h-229l402 -403q19 -19 19 -45z" />
+    <glyph glyph-name="_417" unicode="&#xf1bc;" 
+d="M1127 326q0 32 -30 51q-193 115 -447 115q-133 0 -287 -34q-42 -9 -42 -52q0 -20 13.5 -34.5t35.5 -14.5q5 0 37 8q132 27 243 27q226 0 397 -103q19 -11 33 -11q19 0 33 13.5t14 34.5zM1223 541q0 40 -35 61q-237 141 -548 141q-153 0 -303 -42q-48 -13 -48 -64
+q0 -25 17.5 -42.5t42.5 -17.5q7 0 37 8q122 33 251 33q279 0 488 -124q24 -13 38 -13q25 0 42.5 17.5t17.5 42.5zM1331 789q0 47 -40 70q-126 73 -293 110.5t-343 37.5q-204 0 -364 -47q-23 -7 -38.5 -25.5t-15.5 -48.5q0 -31 20.5 -52t51.5 -21q11 0 40 8q133 37 307 37
+q159 0 309.5 -34t253.5 -95q21 -12 40 -12q29 0 50.5 20.5t21.5 51.5zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_418" unicode="&#xf1bd;" horiz-adv-x="1024" 
+d="M1024 1233l-303 -582l24 -31h279v-415h-507l-44 -30l-142 -273l-30 -30h-301v303l303 583l-24 30h-279v415h507l44 30l142 273l30 30h301v-303z" />
+    <glyph glyph-name="_419" unicode="&#xf1be;" horiz-adv-x="2304" 
+d="M784 164l16 241l-16 523q-1 10 -7.5 17t-16.5 7q-9 0 -16 -7t-7 -17l-14 -523l14 -241q1 -10 7.5 -16.5t15.5 -6.5q22 0 24 23zM1080 193l11 211l-12 586q0 16 -13 24q-8 5 -16 5t-16 -5q-13 -8 -13 -24l-1 -6l-10 -579q0 -1 11 -236v-1q0 -10 6 -17q9 -11 23 -11
+q11 0 20 9q9 7 9 20zM35 533l20 -128l-20 -126q-2 -9 -9 -9t-9 9l-17 126l17 128q2 9 9 9t9 -9zM121 612l26 -207l-26 -203q-2 -9 -10 -9q-9 0 -9 10l-23 202l23 207q0 9 9 9q8 0 10 -9zM401 159zM213 650l25 -245l-25 -237q0 -11 -11 -11q-10 0 -12 11l-21 237l21 245
+q2 12 12 12q11 0 11 -12zM307 657l23 -252l-23 -244q-2 -13 -14 -13q-13 0 -13 13l-21 244l21 252q0 13 13 13q12 0 14 -13zM401 639l21 -234l-21 -246q-2 -16 -16 -16q-6 0 -10.5 4.5t-4.5 11.5l-20 246l20 234q0 6 4.5 10.5t10.5 4.5q14 0 16 -15zM784 164zM495 785
+l21 -380l-21 -246q0 -7 -5 -12.5t-12 -5.5q-16 0 -18 18l-18 246l18 380q2 18 18 18q7 0 12 -5.5t5 -12.5zM589 871l19 -468l-19 -244q0 -8 -5.5 -13.5t-13.5 -5.5q-18 0 -20 19l-16 244l16 468q2 19 20 19q8 0 13.5 -5.5t5.5 -13.5zM687 911l18 -506l-18 -242
+q-2 -21 -22 -21q-19 0 -21 21l-16 242l16 506q0 9 6.5 15.5t14.5 6.5q9 0 15 -6.5t7 -15.5zM1079 169v0v0v0zM881 915l15 -510l-15 -239q0 -10 -7.5 -17.5t-17.5 -7.5t-17 7t-8 18l-14 239l14 510q0 11 7.5 18t17.5 7t17.5 -7t7.5 -18zM980 896l14 -492l-14 -236
+q0 -11 -8 -19t-19 -8t-19 8t-9 19l-12 236l12 492q1 12 9 20t19 8t18.5 -8t8.5 -20zM1192 404l-14 -231v0q0 -13 -9 -22t-22 -9t-22 9t-10 22l-6 114l-6 117l12 636v3q2 15 12 24q9 7 20 7q8 0 15 -5q14 -8 16 -26zM2304 423q0 -117 -83 -199.5t-200 -82.5h-786
+q-13 2 -22 11t-9 22v899q0 23 28 33q85 34 181 34q195 0 338 -131.5t160 -323.5q53 22 110 22q117 0 200 -83t83 -201z" />
+    <glyph glyph-name="uniF1C0" unicode="&#xf1c0;" 
+d="M768 768q237 0 443 43t325 127v-170q0 -69 -103 -128t-280 -93.5t-385 -34.5t-385 34.5t-280 93.5t-103 128v170q119 -84 325 -127t443 -43zM768 0q237 0 443 43t325 127v-170q0 -69 -103 -128t-280 -93.5t-385 -34.5t-385 34.5t-280 93.5t-103 128v170q119 -84 325 -127
+t443 -43zM768 384q237 0 443 43t325 127v-170q0 -69 -103 -128t-280 -93.5t-385 -34.5t-385 34.5t-280 93.5t-103 128v170q119 -84 325 -127t443 -43zM768 1536q208 0 385 -34.5t280 -93.5t103 -128v-128q0 -69 -103 -128t-280 -93.5t-385 -34.5t-385 34.5t-280 93.5
+t-103 128v128q0 69 103 128t280 93.5t385 34.5z" />
+    <glyph glyph-name="uniF1C1" unicode="&#xf1c1;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M894 465q33 -26 84 -56q59 7 117 7q147 0 177 -49q16 -22 2 -52q0 -1 -1 -2l-2 -2v-1q-6 -38 -71 -38q-48 0 -115 20t-130 53q-221 -24 -392 -83q-153 -262 -242 -262q-15 0 -28 7l-24 12q-1 1 -6 5q-10 10 -6 36q9 40 56 91.5t132 96.5q14 9 23 -6q2 -2 2 -4q52 85 107 197
+q68 136 104 262q-24 82 -30.5 159.5t6.5 127.5q11 40 42 40h21h1q23 0 35 -15q18 -21 9 -68q-2 -6 -4 -8q1 -3 1 -8v-30q-2 -123 -14 -192q55 -164 146 -238zM318 54q52 24 137 158q-51 -40 -87.5 -84t-49.5 -74zM716 974q-15 -42 -2 -132q1 7 7 44q0 3 7 43q1 4 4 8
+q-1 1 -1 2q-1 2 -1 3q-1 22 -13 36q0 -1 -1 -2v-2zM592 313q135 54 284 81q-2 1 -13 9.5t-16 13.5q-76 67 -127 176q-27 -86 -83 -197q-30 -56 -45 -83zM1238 329q-24 24 -140 24q76 -28 124 -28q14 0 18 1q0 1 -2 3z" />
+    <glyph glyph-name="_422" unicode="&#xf1c2;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M233 768v-107h70l164 -661h159l128 485q7 20 10 46q2 16 2 24h4l3 -24q1 -3 3.5 -20t5.5 -26l128 -485h159l164 661h70v107h-300v-107h90l-99 -438q-5 -20 -7 -46l-2 -21h-4q0 3 -0.5 6.5t-1.5 8t-1 6.5q-1 5 -4 21t-5 25l-144 545h-114l-144 -545q-2 -9 -4.5 -24.5
+t-3.5 -21.5l-4 -21h-4l-2 21q-2 26 -7 46l-99 438h90v107h-300z" />
+    <glyph glyph-name="_423" unicode="&#xf1c3;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M429 106v-106h281v106h-75l103 161q5 7 10 16.5t7.5 13.5t3.5 4h2q1 -4 5 -10q2 -4 4.5 -7.5t6 -8t6.5 -8.5l107 -161h-76v-106h291v106h-68l-192 273l195 282h67v107h-279v-107h74l-103 -159q-4 -7 -10 -16.5t-9 -13.5l-2 -3h-2q-1 4 -5 10q-6 11 -17 23l-106 159h76v107
+h-290v-107h68l189 -272l-194 -283h-68z" />
+    <glyph glyph-name="_424" unicode="&#xf1c4;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M416 106v-106h327v106h-93v167h137q76 0 118 15q67 23 106.5 87t39.5 146q0 81 -37 141t-100 87q-48 19 -130 19h-368v-107h92v-555h-92zM769 386h-119v268h120q52 0 83 -18q56 -33 56 -115q0 -89 -62 -120q-31 -15 -78 -15z" />
+    <glyph glyph-name="_425" unicode="&#xf1c5;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M1280 320v-320h-1024v192l192 192l128 -128l384 384zM448 512q-80 0 -136 56t-56 136t56 136t136 56t136 -56t56 -136t-56 -136t-136 -56z" />
+    <glyph glyph-name="_426" unicode="&#xf1c6;" 
+d="M640 1152v128h-128v-128h128zM768 1024v128h-128v-128h128zM640 896v128h-128v-128h128zM768 768v128h-128v-128h128zM1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400
+v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-128v-128h-128v128h-512v-1536h1280zM781 593l107 -349q8 -27 8 -52q0 -83 -72.5 -137.5t-183.5 -54.5t-183.5 54.5t-72.5 137.5q0 25 8 52q21 63 120 396v128h128v-128h79
+q22 0 39 -13t23 -34zM640 128q53 0 90.5 19t37.5 45t-37.5 45t-90.5 19t-90.5 -19t-37.5 -45t37.5 -45t90.5 -19z" />
+    <glyph glyph-name="_427" unicode="&#xf1c7;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M620 686q20 -8 20 -30v-544q0 -22 -20 -30q-8 -2 -12 -2q-12 0 -23 9l-166 167h-131q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h131l166 167q16 15 35 7zM1037 -3q31 0 50 24q129 159 129 363t-129 363q-16 21 -43 24t-47 -14q-21 -17 -23.5 -43.5t14.5 -47.5
+q100 -123 100 -282t-100 -282q-17 -21 -14.5 -47.5t23.5 -42.5q18 -15 40 -15zM826 145q27 0 47 20q87 93 87 219t-87 219q-18 19 -45 20t-46 -17t-20 -44.5t18 -46.5q52 -57 52 -131t-52 -131q-19 -20 -18 -46.5t20 -44.5q20 -17 44 -17z" />
+    <glyph glyph-name="_428" unicode="&#xf1c8;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M768 768q52 0 90 -38t38 -90v-384q0 -52 -38 -90t-90 -38h-384q-52 0 -90 38t-38 90v384q0 52 38 90t90 38h384zM1260 766q20 -8 20 -30v-576q0 -22 -20 -30q-8 -2 -12 -2q-14 0 -23 9l-265 266v90l265 266q9 9 23 9q4 0 12 -2z" />
+    <glyph glyph-name="_429" unicode="&#xf1c9;" 
+d="M1468 1156q28 -28 48 -76t20 -88v-1152q0 -40 -28 -68t-68 -28h-1344q-40 0 -68 28t-28 68v1600q0 40 28 68t68 28h896q40 0 88 -20t76 -48zM1024 1400v-376h376q-10 29 -22 41l-313 313q-12 12 -41 22zM1408 -128v1024h-416q-40 0 -68 28t-28 68v416h-768v-1536h1280z
+M480 768q8 11 21 12.5t24 -6.5l51 -38q11 -8 12.5 -21t-6.5 -24l-182 -243l182 -243q8 -11 6.5 -24t-12.5 -21l-51 -38q-11 -8 -24 -6.5t-21 12.5l-226 301q-14 19 0 38zM1282 467q14 -19 0 -38l-226 -301q-8 -11 -21 -12.5t-24 6.5l-51 38q-11 8 -12.5 21t6.5 24l182 243
+l-182 243q-8 11 -6.5 24t12.5 21l51 38q11 8 24 6.5t21 -12.5zM662 6q-13 2 -20.5 13t-5.5 24l138 831q2 13 13 20.5t24 5.5l63 -10q13 -2 20.5 -13t5.5 -24l-138 -831q-2 -13 -13 -20.5t-24 -5.5z" />
+    <glyph glyph-name="_430" unicode="&#xf1ca;" 
+d="M1497 709v-198q-101 -23 -198 -23q-65 -136 -165.5 -271t-181.5 -215.5t-128 -106.5q-80 -45 -162 3q-28 17 -60.5 43.5t-85 83.5t-102.5 128.5t-107.5 184t-105.5 244t-91.5 314.5t-70.5 390h283q26 -218 70 -398.5t104.5 -317t121.5 -235.5t140 -195q169 169 287 406
+q-142 72 -223 220t-81 333q0 192 104 314.5t284 122.5q178 0 273 -105.5t95 -297.5q0 -159 -58 -286q-7 -1 -19.5 -3t-46 -2t-63 6t-62 25.5t-50.5 51.5q31 103 31 184q0 87 -29 132t-79 45q-53 0 -85 -49.5t-32 -140.5q0 -186 105 -293.5t267 -107.5q62 0 121 14z" />
+    <glyph glyph-name="_431" unicode="&#xf1cb;" horiz-adv-x="1792" 
+d="M216 367l603 -402v359l-334 223zM154 511l193 129l-193 129v-258zM973 -35l603 402l-269 180l-334 -223v-359zM896 458l272 182l-272 182l-272 -182zM485 733l334 223v359l-603 -402zM1445 640l193 -129v258zM1307 733l269 180l-603 402v-359zM1792 913v-546
+q0 -41 -34 -64l-819 -546q-21 -13 -43 -13t-43 13l-819 546q-34 23 -34 64v546q0 41 34 64l819 546q21 13 43 13t43 -13l819 -546q34 -23 34 -64z" />
+    <glyph glyph-name="_432" unicode="&#xf1cc;" horiz-adv-x="2048" 
+d="M1800 764q111 -46 179.5 -145.5t68.5 -221.5q0 -164 -118 -280.5t-285 -116.5q-4 0 -11.5 0.5t-10.5 0.5h-1209h-1h-2h-5q-170 10 -288 125.5t-118 280.5q0 110 55 203t147 147q-12 39 -12 82q0 115 82 196t199 81q95 0 172 -58q75 154 222.5 248t326.5 94
+q166 0 306 -80.5t221.5 -218.5t81.5 -301q0 -6 -0.5 -18t-0.5 -18zM468 498q0 -122 84 -193t208 -71q137 0 240 99q-16 20 -47.5 56.5t-43.5 50.5q-67 -65 -144 -65q-55 0 -93.5 33.5t-38.5 87.5q0 53 38.5 87t91.5 34q44 0 84.5 -21t73 -55t65 -75t69 -82t77 -75t97 -55
+t121.5 -21q121 0 204.5 71.5t83.5 190.5q0 121 -84 192t-207 71q-143 0 -241 -97l93 -108q66 64 142 64q52 0 92 -33t40 -84q0 -57 -37 -91.5t-94 -34.5q-43 0 -82.5 21t-72 55t-65.5 75t-69.5 82t-77.5 75t-96.5 55t-118.5 21q-122 0 -207 -70.5t-85 -189.5z" />
+    <glyph glyph-name="_433" unicode="&#xf1cd;" horiz-adv-x="1792" 
+d="M896 1536q182 0 348 -71t286 -191t191 -286t71 -348t-71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71zM896 1408q-190 0 -361 -90l194 -194q82 28 167 28t167 -28l194 194q-171 90 -361 90zM218 279l194 194
+q-28 82 -28 167t28 167l-194 194q-90 -171 -90 -361t90 -361zM896 -128q190 0 361 90l-194 194q-82 -28 -167 -28t-167 28l-194 -194q171 -90 361 -90zM896 256q159 0 271.5 112.5t112.5 271.5t-112.5 271.5t-271.5 112.5t-271.5 -112.5t-112.5 -271.5t112.5 -271.5
+t271.5 -112.5zM1380 473l194 -194q90 171 90 361t-90 361l-194 -194q28 -82 28 -167t-28 -167z" />
+    <glyph glyph-name="_434" unicode="&#xf1ce;" horiz-adv-x="1792" 
+d="M1760 640q0 -176 -68.5 -336t-184 -275.5t-275.5 -184t-336 -68.5t-336 68.5t-275.5 184t-184 275.5t-68.5 336q0 213 97 398.5t265 305.5t374 151v-228q-221 -45 -366.5 -221t-145.5 -406q0 -130 51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5
+t136.5 204t51 248.5q0 230 -145.5 406t-366.5 221v228q206 -31 374 -151t265 -305.5t97 -398.5z" />
+    <glyph glyph-name="uniF1D0" unicode="&#xf1d0;" horiz-adv-x="1792" 
+d="M19 662q8 217 116 406t305 318h5q0 -1 -1 -3q-8 -8 -28 -33.5t-52 -76.5t-60 -110.5t-44.5 -135.5t-14 -150.5t39 -157.5t108.5 -154q50 -50 102 -69.5t90.5 -11.5t69.5 23.5t47 32.5l16 16q39 51 53 116.5t6.5 122.5t-21 107t-26.5 80l-14 29q-10 25 -30.5 49.5t-43 41
+t-43.5 29.5t-35 19l-13 6l104 115q39 -17 78 -52t59 -61l19 -27q1 48 -18.5 103.5t-40.5 87.5l-20 31l161 183l160 -181q-33 -46 -52.5 -102.5t-22.5 -90.5l-4 -33q22 37 61.5 72.5t67.5 52.5l28 17l103 -115q-44 -14 -85 -50t-60 -65l-19 -29q-31 -56 -48 -133.5t-7 -170
+t57 -156.5q33 -45 77.5 -60.5t85 -5.5t76 26.5t57.5 33.5l21 16q60 53 96.5 115t48.5 121.5t10 121.5t-18 118t-37 107.5t-45.5 93t-45 72t-34.5 47.5l-13 17q-14 13 -7 13l10 -3q40 -29 62.5 -46t62 -50t64 -58t58.5 -65t55.5 -77t45.5 -88t38 -103t23.5 -117t10.5 -136
+q3 -259 -108 -465t-312 -321t-456 -115q-185 0 -351 74t-283.5 198t-184 293t-60.5 353z" />
+    <glyph glyph-name="uniF1D1" unicode="&#xf1d1;" horiz-adv-x="1792" 
+d="M874 -102v-66q-208 6 -385 109.5t-283 275.5l58 34q29 -49 73 -99l65 57q148 -168 368 -212l-17 -86q65 -12 121 -13zM276 428l-83 -28q22 -60 49 -112l-57 -33q-98 180 -98 385t98 385l57 -33q-30 -56 -49 -112l82 -28q-35 -100 -35 -212q0 -109 36 -212zM1528 251
+l58 -34q-106 -172 -283 -275.5t-385 -109.5v66q56 1 121 13l-17 86q220 44 368 212l65 -57q44 50 73 99zM1377 805l-233 -80q14 -42 14 -85t-14 -85l232 -80q-31 -92 -98 -169l-185 162q-57 -67 -147 -85l48 -241q-52 -10 -98 -10t-98 10l48 241q-90 18 -147 85l-185 -162
+q-67 77 -98 169l232 80q-14 42 -14 85t14 85l-233 80q33 93 99 169l185 -162q59 68 147 86l-48 240q44 10 98 10t98 -10l-48 -240q88 -18 147 -86l185 162q66 -76 99 -169zM874 1448v-66q-65 -2 -121 -13l17 -86q-220 -42 -368 -211l-65 56q-38 -42 -73 -98l-57 33
+q106 172 282 275.5t385 109.5zM1705 640q0 -205 -98 -385l-57 33q27 52 49 112l-83 28q36 103 36 212q0 112 -35 212l82 28q-19 56 -49 112l57 33q98 -180 98 -385zM1585 1063l-57 -33q-35 56 -73 98l-65 -56q-148 169 -368 211l17 86q-56 11 -121 13v66q209 -6 385 -109.5
+t282 -275.5zM1748 640q0 173 -67.5 331t-181.5 272t-272 181.5t-331 67.5t-331 -67.5t-272 -181.5t-181.5 -272t-67.5 -331t67.5 -331t181.5 -272t272 -181.5t331 -67.5t331 67.5t272 181.5t181.5 272t67.5 331zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71
+t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="uniF1D2" unicode="&#xf1d2;" 
+d="M582 228q0 -66 -93 -66q-107 0 -107 63q0 64 98 64q102 0 102 -61zM546 694q0 -85 -74 -85q-77 0 -77 84q0 90 77 90q36 0 55 -25.5t19 -63.5zM712 769v125q-78 -29 -135 -29q-50 29 -110 29q-86 0 -145 -57t-59 -143q0 -50 29.5 -102t73.5 -67v-3q-38 -17 -38 -85
+q0 -53 41 -77v-3q-113 -37 -113 -139q0 -45 20 -78.5t54 -51t72 -25.5t81 -8q224 0 224 188q0 67 -48 99t-126 46q-27 5 -51.5 20.5t-24.5 39.5q0 44 49 52q77 15 122 70t45 134q0 24 -10 52q37 9 49 13zM771 350h137q-2 27 -2 82v387q0 46 2 69h-137q3 -23 3 -71v-392
+q0 -50 -3 -75zM1280 366v121q-30 -21 -68 -21q-53 0 -53 82v225h52q9 0 26.5 -1t26.5 -1v117h-105q0 82 3 102h-140q4 -24 4 -55v-47h-60v-117q36 3 37 3q3 0 11 -0.5t12 -0.5v-2h-2v-217q0 -37 2.5 -64t11.5 -56.5t24.5 -48.5t43.5 -31t66 -12q64 0 108 24zM924 1072
+q0 36 -24 63.5t-60 27.5t-60.5 -27t-24.5 -64q0 -36 25 -62.5t60 -26.5t59.5 27t24.5 62zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_438" unicode="&#xf1d3;" horiz-adv-x="1792" 
+d="M595 22q0 100 -165 100q-158 0 -158 -104q0 -101 172 -101q151 0 151 105zM536 777q0 61 -30 102t-89 41q-124 0 -124 -145q0 -135 124 -135q119 0 119 137zM805 1101v-202q-36 -12 -79 -22q16 -43 16 -84q0 -127 -73 -216.5t-197 -112.5q-40 -8 -59.5 -27t-19.5 -58
+q0 -31 22.5 -51.5t58 -32t78.5 -22t86 -25.5t78.5 -37.5t58 -64t22.5 -98.5q0 -304 -363 -304q-69 0 -130 12.5t-116 41t-87.5 82t-32.5 127.5q0 165 182 225v4q-67 41 -67 126q0 109 63 137v4q-72 24 -119.5 108.5t-47.5 165.5q0 139 95 231.5t235 92.5q96 0 178 -47
+q98 0 218 47zM1123 220h-222q4 45 4 134v609q0 94 -4 128h222q-4 -33 -4 -124v-613q0 -89 4 -134zM1724 442v-196q-71 -39 -174 -39q-62 0 -107 20t-70 50t-39.5 78t-18.5 92t-4 103v351h2v4q-7 0 -19 1t-18 1q-21 0 -59 -6v190h96v76q0 54 -6 89h227q-6 -41 -6 -165h171
+v-190q-15 0 -43.5 2t-42.5 2h-85v-365q0 -131 87 -131q61 0 109 33zM1148 1389q0 -58 -39 -101.5t-96 -43.5q-58 0 -98 43.5t-40 101.5q0 59 39.5 103t98.5 44q58 0 96.5 -44.5t38.5 -102.5z" />
+    <glyph glyph-name="_439" unicode="&#xf1d4;" 
+d="M809 532l266 499h-112l-157 -312q-24 -48 -44 -92l-42 92l-155 312h-120l263 -493v-324h101v318zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="uniF1D5" unicode="&#xf1d5;" horiz-adv-x="1280" 
+d="M842 964q0 -80 -57 -136.5t-136 -56.5q-60 0 -111 35q-62 -67 -115 -146q-247 -371 -202 -859q1 -22 -12.5 -38.5t-34.5 -18.5h-5q-20 0 -35 13.5t-17 33.5q-14 126 -3.5 247.5t29.5 217t54 186t69 155.5t74 125q61 90 132 165q-16 35 -16 77q0 80 56.5 136.5t136.5 56.5
+t136.5 -56.5t56.5 -136.5zM1223 953q0 -158 -78 -292t-212.5 -212t-292.5 -78q-64 0 -131 14q-21 5 -32.5 23.5t-6.5 39.5q5 20 23 31.5t39 7.5q51 -13 108 -13q97 0 186 38t153 102t102 153t38 186t-38 186t-102 153t-153 102t-186 38t-186 -38t-153 -102t-102 -153
+t-38 -186q0 -114 52 -218q10 -20 3.5 -40t-25.5 -30t-39.5 -3t-30.5 26q-64 123 -64 265q0 119 46.5 227t124.5 186t186 124t226 46q158 0 292.5 -78t212.5 -212.5t78 -292.5z" />
+    <glyph glyph-name="uniF1D6" unicode="&#xf1d6;" horiz-adv-x="1792" 
+d="M270 730q-8 19 -8 52q0 20 11 49t24 45q-1 22 7.5 53t22.5 43q0 139 92.5 288.5t217.5 209.5q139 66 324 66q133 0 266 -55q49 -21 90 -48t71 -56t55 -68t42 -74t32.5 -84.5t25.5 -89.5t22 -98l1 -5q55 -83 55 -150q0 -14 -9 -40t-9 -38q0 -1 1.5 -3.5t3.5 -5t2 -3.5
+q77 -114 120.5 -214.5t43.5 -208.5q0 -43 -19.5 -100t-55.5 -57q-9 0 -19.5 7.5t-19 17.5t-19 26t-16 26.5t-13.5 26t-9 17.5q-1 1 -3 1l-5 -4q-59 -154 -132 -223q20 -20 61.5 -38.5t69 -41.5t35.5 -65q-2 -4 -4 -16t-7 -18q-64 -97 -302 -97q-53 0 -110.5 9t-98 20
+t-104.5 30q-15 5 -23 7q-14 4 -46 4.5t-40 1.5q-41 -45 -127.5 -65t-168.5 -20q-35 0 -69 1.5t-93 9t-101 20.5t-74.5 40t-32.5 64q0 40 10 59.5t41 48.5q11 2 40.5 13t49.5 12q4 0 14 2q2 2 2 4l-2 3q-48 11 -108 105.5t-73 156.5l-5 3q-4 0 -12 -20q-18 -41 -54.5 -74.5
+t-77.5 -37.5h-1q-4 0 -6 4.5t-5 5.5q-23 54 -23 100q0 275 252 466z" />
+    <glyph glyph-name="uniF1D7" unicode="&#xf1d7;" horiz-adv-x="2048" 
+d="M580 1075q0 41 -25 66t-66 25q-43 0 -76 -25.5t-33 -65.5q0 -39 33 -64.5t76 -25.5q41 0 66 24.5t25 65.5zM1323 568q0 28 -25.5 50t-65.5 22q-27 0 -49.5 -22.5t-22.5 -49.5q0 -28 22.5 -50.5t49.5 -22.5q40 0 65.5 22t25.5 51zM1087 1075q0 41 -24.5 66t-65.5 25
+q-43 0 -76 -25.5t-33 -65.5q0 -39 33 -64.5t76 -25.5q41 0 65.5 24.5t24.5 65.5zM1722 568q0 28 -26 50t-65 22q-27 0 -49.5 -22.5t-22.5 -49.5q0 -28 22.5 -50.5t49.5 -22.5q39 0 65 22t26 51zM1456 965q-31 4 -70 4q-169 0 -311 -77t-223.5 -208.5t-81.5 -287.5
+q0 -78 23 -152q-35 -3 -68 -3q-26 0 -50 1.5t-55 6.5t-44.5 7t-54.5 10.5t-50 10.5l-253 -127l72 218q-290 203 -290 490q0 169 97.5 311t264 223.5t363.5 81.5q176 0 332.5 -66t262 -182.5t136.5 -260.5zM2048 404q0 -117 -68.5 -223.5t-185.5 -193.5l55 -181l-199 109
+q-150 -37 -218 -37q-169 0 -311 70.5t-223.5 191.5t-81.5 264t81.5 264t223.5 191.5t311 70.5q161 0 303 -70.5t227.5 -192t85.5 -263.5z" />
+    <glyph glyph-name="_443" unicode="&#xf1d8;" horiz-adv-x="1792" 
+d="M1764 1525q33 -24 27 -64l-256 -1536q-5 -29 -32 -45q-14 -8 -31 -8q-11 0 -24 5l-453 185l-242 -295q-18 -23 -49 -23q-13 0 -22 4q-19 7 -30.5 23.5t-11.5 36.5v349l864 1059l-1069 -925l-395 162q-37 14 -40 55q-2 40 32 59l1664 960q15 9 32 9q20 0 36 -11z" />
+    <glyph glyph-name="_444" unicode="&#xf1d9;" horiz-adv-x="1792" 
+d="M1764 1525q33 -24 27 -64l-256 -1536q-5 -29 -32 -45q-14 -8 -31 -8q-11 0 -24 5l-527 215l-298 -327q-18 -21 -47 -21q-14 0 -23 4q-19 7 -30 23.5t-11 36.5v452l-472 193q-37 14 -40 55q-3 39 32 59l1664 960q35 21 68 -2zM1422 26l221 1323l-1434 -827l336 -137
+l863 639l-478 -797z" />
+    <glyph glyph-name="_445" unicode="&#xf1da;" 
+d="M1536 640q0 -156 -61 -298t-164 -245t-245 -164t-298 -61q-172 0 -327 72.5t-264 204.5q-7 10 -6.5 22.5t8.5 20.5l137 138q10 9 25 9q16 -2 23 -12q73 -95 179 -147t225 -52q104 0 198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5t-40.5 198.5t-109.5 163.5
+t-163.5 109.5t-198.5 40.5q-98 0 -188 -35.5t-160 -101.5l137 -138q31 -30 14 -69q-17 -40 -59 -40h-448q-26 0 -45 19t-19 45v448q0 42 40 59q39 17 69 -14l130 -129q107 101 244.5 156.5t284.5 55.5q156 0 298 -61t245 -164t164 -245t61 -298zM896 928v-448q0 -14 -9 -23
+t-23 -9h-320q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h224v352q0 14 9 23t23 9h64q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="_446" unicode="&#xf1db;" 
+d="M768 1280q-130 0 -248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5t-51 248.5t-136.5 204t-204 136.5t-248.5 51zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103
+t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_447" unicode="&#xf1dc;" horiz-adv-x="1792" 
+d="M1682 -128q-44 0 -132.5 3.5t-133.5 3.5q-44 0 -132 -3.5t-132 -3.5q-24 0 -37 20.5t-13 45.5q0 31 17 46t39 17t51 7t45 15q33 21 33 140l-1 391q0 21 -1 31q-13 4 -50 4h-675q-38 0 -51 -4q-1 -10 -1 -31l-1 -371q0 -142 37 -164q16 -10 48 -13t57 -3.5t45 -15
+t20 -45.5q0 -26 -12.5 -48t-36.5 -22q-47 0 -139.5 3.5t-138.5 3.5q-43 0 -128 -3.5t-127 -3.5q-23 0 -35.5 21t-12.5 45q0 30 15.5 45t36 17.5t47.5 7.5t42 15q33 23 33 143l-1 57v813q0 3 0.5 26t0 36.5t-1.5 38.5t-3.5 42t-6.5 36.5t-11 31.5t-16 18q-15 10 -45 12t-53 2
+t-41 14t-18 45q0 26 12 48t36 22q46 0 138.5 -3.5t138.5 -3.5q42 0 126.5 3.5t126.5 3.5q25 0 37.5 -22t12.5 -48q0 -30 -17 -43.5t-38.5 -14.5t-49.5 -4t-43 -13q-35 -21 -35 -160l1 -320q0 -21 1 -32q13 -3 39 -3h699q25 0 38 3q1 11 1 32l1 320q0 139 -35 160
+q-18 11 -58.5 12.5t-66 13t-25.5 49.5q0 26 12.5 48t37.5 22q44 0 132 -3.5t132 -3.5q43 0 129 3.5t129 3.5q25 0 37.5 -22t12.5 -48q0 -30 -17.5 -44t-40 -14.5t-51.5 -3t-44 -12.5q-35 -23 -35 -161l1 -943q0 -119 34 -140q16 -10 46 -13.5t53.5 -4.5t41.5 -15.5t18 -44.5
+q0 -26 -12 -48t-36 -22z" />
+    <glyph glyph-name="_448" unicode="&#xf1dd;" horiz-adv-x="1280" 
+d="M1278 1347v-73q0 -29 -18.5 -61t-42.5 -32q-50 0 -54 -1q-26 -6 -32 -31q-3 -11 -3 -64v-1152q0 -25 -18 -43t-43 -18h-108q-25 0 -43 18t-18 43v1218h-143v-1218q0 -25 -17.5 -43t-43.5 -18h-108q-26 0 -43.5 18t-17.5 43v496q-147 12 -245 59q-126 58 -192 179
+q-64 117 -64 259q0 166 88 286q88 118 209 159q111 37 417 37h479q25 0 43 -18t18 -43z" />
+    <glyph glyph-name="_449" unicode="&#xf1de;" 
+d="M352 128v-128h-352v128h352zM704 256q26 0 45 -19t19 -45v-256q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h256zM864 640v-128h-864v128h864zM224 1152v-128h-224v128h224zM1536 128v-128h-736v128h736zM576 1280q26 0 45 -19t19 -45v-256
+q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h256zM1216 768q26 0 45 -19t19 -45v-256q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h256zM1536 640v-128h-224v128h224zM1536 1152v-128h-864v128h864z" />
+    <glyph glyph-name="uniF1E0" unicode="&#xf1e0;" 
+d="M1216 512q133 0 226.5 -93.5t93.5 -226.5t-93.5 -226.5t-226.5 -93.5t-226.5 93.5t-93.5 226.5q0 12 2 34l-360 180q-92 -86 -218 -86q-133 0 -226.5 93.5t-93.5 226.5t93.5 226.5t226.5 93.5q126 0 218 -86l360 180q-2 22 -2 34q0 133 93.5 226.5t226.5 93.5
+t226.5 -93.5t93.5 -226.5t-93.5 -226.5t-226.5 -93.5q-126 0 -218 86l-360 -180q2 -22 2 -34t-2 -34l360 -180q92 86 218 86z" />
+    <glyph glyph-name="_451" unicode="&#xf1e1;" 
+d="M1280 341q0 88 -62.5 151t-150.5 63q-84 0 -145 -58l-241 120q2 16 2 23t-2 23l241 120q61 -58 145 -58q88 0 150.5 63t62.5 151t-62.5 150.5t-150.5 62.5t-151 -62.5t-63 -150.5q0 -7 2 -23l-241 -120q-62 57 -145 57q-88 0 -150.5 -62.5t-62.5 -150.5t62.5 -150.5
+t150.5 -62.5q83 0 145 57l241 -120q-2 -16 -2 -23q0 -88 63 -150.5t151 -62.5t150.5 62.5t62.5 150.5zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_452" unicode="&#xf1e2;" horiz-adv-x="1792" 
+d="M571 947q-10 25 -34 35t-49 0q-108 -44 -191 -127t-127 -191q-10 -25 0 -49t35 -34q13 -5 24 -5q42 0 60 40q34 84 98.5 148.5t148.5 98.5q25 11 35 35t0 49zM1513 1303l46 -46l-244 -243l68 -68q19 -19 19 -45.5t-19 -45.5l-64 -64q89 -161 89 -343q0 -143 -55.5 -273.5
+t-150 -225t-225 -150t-273.5 -55.5t-273.5 55.5t-225 150t-150 225t-55.5 273.5t55.5 273.5t150 225t225 150t273.5 55.5q182 0 343 -89l64 64q19 19 45.5 19t45.5 -19l68 -68zM1521 1359q-10 -10 -22 -10q-13 0 -23 10l-91 90q-9 10 -9 23t9 23q10 9 23 9t23 -9l90 -91
+q10 -9 10 -22.5t-10 -22.5zM1751 1129q-11 -9 -23 -9t-23 9l-90 91q-10 9 -10 22.5t10 22.5q9 10 22.5 10t22.5 -10l91 -90q9 -10 9 -23t-9 -23zM1792 1312q0 -14 -9 -23t-23 -9h-96q-14 0 -23 9t-9 23t9 23t23 9h96q14 0 23 -9t9 -23zM1600 1504v-96q0 -14 -9 -23t-23 -9
+t-23 9t-9 23v96q0 14 9 23t23 9t23 -9t9 -23zM1751 1449l-91 -90q-10 -10 -22 -10q-13 0 -23 10q-10 9 -10 22.5t10 22.5l90 91q10 9 23 9t23 -9q9 -10 9 -23t-9 -23z" />
+    <glyph glyph-name="_453" unicode="&#xf1e3;" horiz-adv-x="1792" 
+d="M609 720l287 208l287 -208l-109 -336h-355zM896 1536q182 0 348 -71t286 -191t191 -286t71 -348t-71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71zM1515 186q149 203 149 454v3l-102 -89l-240 224l63 323
+l134 -12q-150 206 -389 282l53 -124l-287 -159l-287 159l53 124q-239 -76 -389 -282l135 12l62 -323l-240 -224l-102 89v-3q0 -251 149 -454l30 132l326 -40l139 -298l-116 -69q117 -39 240 -39t240 39l-116 69l139 298l326 40z" />
+    <glyph glyph-name="_454" unicode="&#xf1e4;" horiz-adv-x="1792" 
+d="M448 224v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM256 608v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM832 224v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23
+v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM640 608v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM66 768q-28 0 -47 19t-19 46v129h514v-129q0 -27 -19 -46t-46 -19h-383zM1216 224v-192q0 -14 -9 -23t-23 -9h-192
+q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1024 608v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1600 224v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23
+zM1408 608v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1792 1016v-13h-514v10q0 104 -382 102q-382 -1 -382 -102v-10h-514v13q0 17 8.5 43t34 64t65.5 75.5t110.5 76t160 67.5t224 47.5t293.5 18.5t293 -18.5t224 -47.5
+t160.5 -67.5t110.5 -76t65.5 -75.5t34 -64t8.5 -43zM1792 608v-192q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v192q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1792 962v-129q0 -27 -19 -46t-46 -19h-384q-27 0 -46 19t-19 46v129h514z" />
+    <glyph glyph-name="_455" unicode="&#xf1e5;" horiz-adv-x="1792" 
+d="M704 1216v-768q0 -26 -19 -45t-45 -19v-576q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v512l249 873q7 23 31 23h424zM1024 1216v-704h-256v704h256zM1792 320v-512q0 -26 -19 -45t-45 -19h-512q-26 0 -45 19t-19 45v576q-26 0 -45 19t-19 45v768h424q24 0 31 -23z
+M736 1504v-224h-352v224q0 14 9 23t23 9h288q14 0 23 -9t9 -23zM1408 1504v-224h-352v224q0 14 9 23t23 9h288q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="_456" unicode="&#xf1e6;" horiz-adv-x="1792" 
+d="M1755 1083q37 -38 37 -90.5t-37 -90.5l-401 -400l150 -150l-160 -160q-163 -163 -389.5 -186.5t-411.5 100.5l-362 -362h-181v181l362 362q-124 185 -100.5 411.5t186.5 389.5l160 160l150 -150l400 401q38 37 91 37t90 -37t37 -90.5t-37 -90.5l-400 -401l234 -234
+l401 400q38 37 91 37t90 -37z" />
+    <glyph glyph-name="_457" unicode="&#xf1e7;" horiz-adv-x="1792" 
+d="M873 796q0 -83 -63.5 -142.5t-152.5 -59.5t-152.5 59.5t-63.5 142.5q0 84 63.5 143t152.5 59t152.5 -59t63.5 -143zM1375 796q0 -83 -63 -142.5t-153 -59.5q-89 0 -152.5 59.5t-63.5 142.5q0 84 63.5 143t152.5 59q90 0 153 -59t63 -143zM1600 616v667q0 87 -32 123.5
+t-111 36.5h-1112q-83 0 -112.5 -34t-29.5 -126v-673q43 -23 88.5 -40t81 -28t81 -18.5t71 -11t70 -4t58.5 -0.5t56.5 2t44.5 2q68 1 95 -27q6 -6 10 -9q26 -25 61 -51q7 91 118 87q5 0 36.5 -1.5t43 -2t45.5 -1t53 1t54.5 4.5t61 8.5t62 13.5t67 19.5t67.5 27t72 34.5z
+M1763 621q-121 -149 -372 -252q84 -285 -23 -465q-66 -113 -183 -148q-104 -32 -182 15q-86 51 -82 164l-1 326v1q-8 2 -24.5 6t-23.5 5l-1 -338q4 -114 -83 -164q-79 -47 -183 -15q-117 36 -182 150q-105 180 -22 463q-251 103 -372 252q-25 37 -4 63t60 -1q4 -2 11.5 -7
+t10.5 -8v694q0 72 47 123t114 51h1257q67 0 114 -51t47 -123v-694l21 15q39 27 60 1t-4 -63z" />
+    <glyph glyph-name="_458" unicode="&#xf1e8;" horiz-adv-x="1792" 
+d="M896 1102v-434h-145v434h145zM1294 1102v-434h-145v434h145zM1294 342l253 254v795h-1194v-1049h326v-217l217 217h398zM1692 1536v-1013l-434 -434h-326l-217 -217h-217v217h-398v1158l109 289h1483z" />
+    <glyph glyph-name="_459" unicode="&#xf1e9;" 
+d="M773 217v-127q-1 -292 -6 -305q-12 -32 -51 -40q-54 -9 -181.5 38t-162.5 89q-13 15 -17 36q-1 12 4 26q4 10 34 47t181 216q1 0 60 70q15 19 39.5 24.5t49.5 -3.5q24 -10 37.5 -29t12.5 -42zM624 468q-3 -55 -52 -70l-120 -39q-275 -88 -292 -88q-35 2 -54 36
+q-12 25 -17 75q-8 76 1 166.5t30 124.5t56 32q13 0 202 -77q71 -29 115 -47l84 -34q23 -9 35.5 -30.5t11.5 -48.5zM1450 171q-7 -54 -91.5 -161t-135.5 -127q-37 -14 -63 7q-14 10 -184 287l-47 77q-14 21 -11.5 46t19.5 46q35 43 83 26q1 -1 119 -40q203 -66 242 -79.5
+t47 -20.5q28 -22 22 -61zM778 803q5 -102 -54 -122q-58 -17 -114 71l-378 598q-8 35 19 62q41 43 207.5 89.5t224.5 31.5q40 -10 49 -45q3 -18 22 -305.5t24 -379.5zM1440 695q3 -39 -26 -59q-15 -10 -329 -86q-67 -15 -91 -23l1 2q-23 -6 -46 4t-37 32q-30 47 0 87
+q1 1 75 102q125 171 150 204t34 39q28 19 65 2q48 -23 123 -133.5t81 -167.5v-3z" />
+    <glyph glyph-name="_460" unicode="&#xf1ea;" horiz-adv-x="2048" 
+d="M1024 1024h-384v-384h384v384zM1152 384v-128h-640v128h640zM1152 1152v-640h-640v640h640zM1792 384v-128h-512v128h512zM1792 640v-128h-512v128h512zM1792 896v-128h-512v128h512zM1792 1152v-128h-512v128h512zM256 192v960h-128v-960q0 -26 19 -45t45 -19t45 19
+t19 45zM1920 192v1088h-1536v-1088q0 -33 -11 -64h1483q26 0 45 19t19 45zM2048 1408v-1216q0 -80 -56 -136t-136 -56h-1664q-80 0 -136 56t-56 136v1088h256v128h1792z" />
+    <glyph glyph-name="_461" unicode="&#xf1eb;" horiz-adv-x="2048" 
+d="M1024 13q-20 0 -93 73.5t-73 93.5q0 32 62.5 54t103.5 22t103.5 -22t62.5 -54q0 -20 -73 -93.5t-93 -73.5zM1294 284q-2 0 -40 25t-101.5 50t-128.5 25t-128.5 -25t-101 -50t-40.5 -25q-18 0 -93.5 75t-75.5 93q0 13 10 23q78 77 196 121t233 44t233 -44t196 -121
+q10 -10 10 -23q0 -18 -75.5 -93t-93.5 -75zM1567 556q-11 0 -23 8q-136 105 -252 154.5t-268 49.5q-85 0 -170.5 -22t-149 -53t-113.5 -62t-79 -53t-31 -22q-17 0 -92 75t-75 93q0 12 10 22q132 132 320 205t380 73t380 -73t320 -205q10 -10 10 -22q0 -18 -75 -93t-92 -75z
+M1838 827q-11 0 -22 9q-179 157 -371.5 236.5t-420.5 79.5t-420.5 -79.5t-371.5 -236.5q-11 -9 -22 -9q-17 0 -92.5 75t-75.5 93q0 13 10 23q187 186 445 288t527 102t527 -102t445 -288q10 -10 10 -23q0 -18 -75.5 -93t-92.5 -75z" />
+    <glyph glyph-name="_462" unicode="&#xf1ec;" horiz-adv-x="1792" 
+d="M384 0q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM768 0q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM384 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5
+t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1152 0q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM768 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5
+t37.5 90.5zM384 768q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1152 384q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM768 768q0 53 -37.5 90.5t-90.5 37.5
+t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1536 0v384q0 52 -38 90t-90 38t-90 -38t-38 -90v-384q0 -52 38 -90t90 -38t90 38t38 90zM1152 768q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5z
+M1536 1088v256q0 26 -19 45t-45 19h-1280q-26 0 -45 -19t-19 -45v-256q0 -26 19 -45t45 -19h1280q26 0 45 19t19 45zM1536 768q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1664 1408v-1536q0 -52 -38 -90t-90 -38
+h-1408q-52 0 -90 38t-38 90v1536q0 52 38 90t90 38h1408q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_463" unicode="&#xf1ed;" 
+d="M1519 890q18 -84 -4 -204q-87 -444 -565 -444h-44q-25 0 -44 -16.5t-24 -42.5l-4 -19l-55 -346l-2 -15q-5 -26 -24.5 -42.5t-44.5 -16.5h-251q-21 0 -33 15t-9 36q9 56 26.5 168t26.5 168t27 167.5t27 167.5q5 37 43 37h131q133 -2 236 21q175 39 287 144q102 95 155 246
+q24 70 35 133q1 6 2.5 7.5t3.5 1t6 -3.5q79 -59 98 -162zM1347 1172q0 -107 -46 -236q-80 -233 -302 -315q-113 -40 -252 -42q0 -1 -90 -1l-90 1q-100 0 -118 -96q-2 -8 -85 -530q-1 -10 -12 -10h-295q-22 0 -36.5 16.5t-11.5 38.5l232 1471q5 29 27.5 48t51.5 19h598
+q34 0 97.5 -13t111.5 -32q107 -41 163.5 -123t56.5 -196z" />
+    <glyph glyph-name="_464" unicode="&#xf1ee;" horiz-adv-x="1792" 
+d="M441 864q33 0 52 -26q266 -364 362 -774h-446q-127 441 -367 749q-12 16 -3 33.5t29 17.5h373zM1000 507q-49 -199 -125 -393q-79 310 -256 594q40 221 44 449q211 -340 337 -650zM1099 1216q235 -324 384.5 -698.5t184.5 -773.5h-451q-41 665 -553 1472h435zM1792 640
+q0 -424 -101 -812q-67 560 -359 1083q-25 301 -106 584q-4 16 5.5 28.5t25.5 12.5h359q21 0 38.5 -13t22.5 -33q115 -409 115 -850z" />
+    <glyph glyph-name="uniF1F0" unicode="&#xf1f0;" horiz-adv-x="2304" 
+d="M1975 546h-138q14 37 66 179l3 9q4 10 10 26t9 26l12 -55zM531 611l-58 295q-11 54 -75 54h-268l-2 -13q311 -79 403 -336zM710 960l-162 -438l-17 89q-26 70 -85 129.5t-131 88.5l135 -510h175l261 641h-176zM849 318h166l104 642h-166zM1617 944q-69 27 -149 27
+q-123 0 -201 -59t-79 -153q-1 -102 145 -174q48 -23 67 -41t19 -39q0 -30 -30 -46t-69 -16q-86 0 -156 33l-22 11l-23 -144q74 -34 185 -34q130 -1 208.5 59t80.5 160q0 106 -140 174q-49 25 -71 42t-22 38q0 22 24.5 38.5t70.5 16.5q70 1 124 -24l15 -8zM2042 960h-128
+q-65 0 -87 -54l-246 -588h174l35 96h212q5 -22 20 -96h154zM2304 1280v-1280q0 -52 -38 -90t-90 -38h-2048q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h2048q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_466" unicode="&#xf1f1;" horiz-adv-x="2304" 
+d="M1119 1195q-128 85 -281 85q-103 0 -197.5 -40.5t-162.5 -108.5t-108.5 -162t-40.5 -197q0 -104 40.5 -198t108.5 -162t162 -108.5t198 -40.5q153 0 281 85q-131 107 -178 265.5t0.5 316.5t177.5 265zM1152 1171q-126 -99 -172 -249.5t-0.5 -300.5t172.5 -249
+q127 99 172.5 249t-0.5 300.5t-172 249.5zM1185 1195q130 -107 177.5 -265.5t0.5 -317t-178 -264.5q128 -85 281 -85q104 0 198 40.5t162 108.5t108.5 162t40.5 198q0 103 -40.5 197t-108.5 162t-162.5 108.5t-197.5 40.5q-153 0 -281 -85zM1926 473h7v3h-17v-3h7v-17h3v17z
+M1955 456h4v20h-5l-6 -13l-6 13h-5v-20h3v15l6 -13h4l5 13v-15zM1947 16v-2h-2h-3v3h3h2v-1zM1947 7h3l-4 5h2l1 1q1 1 1 3t-1 3l-1 1h-3h-6v-13h3v5h1zM685 75q0 19 11 31t30 12q18 0 29 -12.5t11 -30.5q0 -19 -11 -31t-29 -12q-19 0 -30 12t-11 31zM1158 119q30 0 35 -32
+h-70q5 32 35 32zM1514 75q0 19 11 31t29 12t29.5 -12.5t11.5 -30.5q0 -19 -11 -31t-30 -12q-18 0 -29 12t-11 31zM1786 75q0 18 11.5 30.5t29.5 12.5t29.5 -12.5t11.5 -30.5q0 -19 -11.5 -31t-29.5 -12t-29.5 12.5t-11.5 30.5zM1944 3q-2 0 -4 1q-1 0 -3 2t-2 3q-1 2 -1 4
+q0 3 1 4q0 2 2 4l1 1q2 0 2 1q2 1 4 1q3 0 4 -1l4 -2l2 -4v-1q1 -2 1 -3l-1 -1v-3t-1 -1l-1 -2q-2 -2 -4 -2q-1 -1 -4 -1zM599 7h30v85q0 24 -14.5 38.5t-39.5 15.5q-32 0 -47 -24q-14 24 -45 24q-24 0 -39 -20v16h-30v-135h30v75q0 36 33 36q30 0 30 -36v-75h29v75
+q0 36 33 36q30 0 30 -36v-75zM765 7h29v68v67h-29v-16q-17 20 -43 20q-29 0 -48 -20t-19 -51t19 -51t48 -20q28 0 43 20v-17zM943 48q0 34 -47 40l-14 2q-23 4 -23 14q0 15 25 15q23 0 43 -11l12 24q-22 14 -55 14q-26 0 -41 -12t-15 -32q0 -33 47 -39l13 -2q24 -4 24 -14
+q0 -17 -31 -17q-25 0 -45 14l-13 -23q25 -17 58 -17q29 0 45.5 12t16.5 32zM1073 14l-8 25q-13 -7 -26 -7q-19 0 -19 22v61h48v27h-48v41h-30v-41h-28v-27h28v-61q0 -50 47 -50q21 0 36 10zM1159 146q-29 0 -48 -20t-19 -51q0 -32 19.5 -51.5t49.5 -19.5q33 0 55 19l-14 22
+q-18 -15 -39 -15q-34 0 -41 33h101v12q0 32 -18 51.5t-46 19.5zM1318 146q-23 0 -35 -20v16h-30v-135h30v76q0 35 29 35q10 0 18 -4l9 28q-9 4 -21 4zM1348 75q0 -31 19.5 -51t52.5 -20q29 0 48 16l-14 24q-18 -13 -35 -12q-18 0 -29.5 12t-11.5 31t11.5 31t29.5 12
+q19 0 35 -12l14 24q-20 16 -48 16q-33 0 -52.5 -20t-19.5 -51zM1593 7h30v68v67h-30v-16q-15 20 -42 20q-29 0 -48.5 -20t-19.5 -51t19.5 -51t48.5 -20q28 0 42 20v-17zM1726 146q-23 0 -35 -20v16h-29v-135h29v76q0 35 29 35q10 0 18 -4l9 28q-8 4 -21 4zM1866 7h29v68v122
+h-29v-71q-15 20 -43 20t-47.5 -20.5t-19.5 -50.5t19.5 -50.5t47.5 -20.5q29 0 43 20v-17zM1944 27l-2 -1h-3q-2 -1 -4 -3q-3 -1 -3 -4q-1 -2 -1 -6q0 -3 1 -5q0 -2 3 -4q2 -2 4 -3t5 -1q4 0 6 1q0 1 2 2l2 1q1 1 3 4q1 2 1 5q0 4 -1 6q-1 1 -3 4q0 1 -2 2l-2 1q-1 0 -3 0.5
+t-3 0.5zM2304 1280v-1280q0 -52 -38 -90t-90 -38h-2048q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h2048q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_467" unicode="&#xf1f2;" horiz-adv-x="2304" 
+d="M313 759q0 -51 -36 -84q-29 -26 -89 -26h-17v220h17q61 0 89 -27q36 -31 36 -83zM2089 824q0 -52 -64 -52h-19v101h20q63 0 63 -49zM380 759q0 74 -50 120.5t-129 46.5h-95v-333h95q74 0 119 38q60 51 60 128zM410 593h65v333h-65v-333zM730 694q0 40 -20.5 62t-75.5 42
+q-29 10 -39.5 19t-10.5 23q0 16 13.5 26.5t34.5 10.5q29 0 53 -27l34 44q-41 37 -98 37q-44 0 -74 -27.5t-30 -67.5q0 -35 18 -55.5t64 -36.5q37 -13 45 -19q19 -12 19 -34q0 -20 -14 -33.5t-36 -13.5q-48 0 -71 44l-42 -40q44 -64 115 -64q51 0 83 30.5t32 79.5zM1008 604
+v77q-37 -37 -78 -37q-49 0 -80.5 32.5t-31.5 82.5q0 48 31.5 81.5t77.5 33.5q43 0 81 -38v77q-40 20 -80 20q-74 0 -125.5 -50.5t-51.5 -123.5t51 -123.5t125 -50.5q42 0 81 19zM2240 0v527q-65 -40 -144.5 -84t-237.5 -117t-329.5 -137.5t-417.5 -134.5t-504 -118h1569
+q26 0 45 19t19 45zM1389 757q0 75 -53 128t-128 53t-128 -53t-53 -128t53 -128t128 -53t128 53t53 128zM1541 584l144 342h-71l-90 -224l-89 224h-71l142 -342h35zM1714 593h184v56h-119v90h115v56h-115v74h119v57h-184v-333zM2105 593h80l-105 140q76 16 76 94q0 47 -31 73
+t-87 26h-97v-333h65v133h9zM2304 1274v-1268q0 -56 -38.5 -95t-93.5 -39h-2040q-55 0 -93.5 39t-38.5 95v1268q0 56 38.5 95t93.5 39h2040q55 0 93.5 -39t38.5 -95z" />
+    <glyph glyph-name="f1f3" unicode="&#xf1f3;" horiz-adv-x="2304" 
+d="M119 854h89l-45 108zM740 328l74 79l-70 79h-163v-49h142v-55h-142v-54h159zM898 406l99 -110v217zM1186 453q0 33 -40 33h-84v-69h83q41 0 41 36zM1475 457q0 29 -42 29h-82v-61h81q43 0 43 32zM1197 923q0 29 -42 29h-82v-60h81q43 0 43 31zM1656 854h89l-44 108z
+M699 1009v-271h-66v212l-94 -212h-57l-94 212v-212h-132l-25 60h-135l-25 -60h-70l116 271h96l110 -257v257h106l85 -184l77 184h108zM1255 453q0 -20 -5.5 -35t-14 -25t-22.5 -16.5t-26 -10t-31.5 -4.5t-31.5 -1t-32.5 0.5t-29.5 0.5v-91h-126l-80 90l-83 -90h-256v271h260
+l80 -89l82 89h207q109 0 109 -89zM964 794v-56h-217v271h217v-57h-152v-49h148v-55h-148v-54h152zM2304 235v-229q0 -55 -38.5 -94.5t-93.5 -39.5h-2040q-55 0 -93.5 39.5t-38.5 94.5v678h111l25 61h55l25 -61h218v46l19 -46h113l20 47v-47h541v99l10 1q10 0 10 -14v-86h279
+v23q23 -12 55 -18t52.5 -6.5t63 0.5t51.5 1l25 61h56l25 -61h227v58l34 -58h182v378h-180v-44l-25 44h-185v-44l-23 44h-249q-69 0 -109 -22v22h-172v-22q-24 22 -73 22h-628l-43 -97l-43 97h-198v-44l-22 44h-169l-78 -179v391q0 55 38.5 94.5t93.5 39.5h2040
+q55 0 93.5 -39.5t38.5 -94.5v-678h-120q-51 0 -81 -22v22h-177q-55 0 -78 -22v22h-316v-22q-31 22 -87 22h-209v-22q-23 22 -91 22h-234l-54 -58l-50 58h-349v-378h343l55 59l52 -59h211v89h21q59 0 90 13v-102h174v99h8q8 0 10 -2t2 -10v-87h529q57 0 88 24v-24h168
+q60 0 95 17zM1546 469q0 -23 -12 -43t-34 -29q25 -9 34 -26t9 -46v-54h-65v45q0 33 -12 43.5t-46 10.5h-69v-99h-65v271h154q48 0 77 -15t29 -58zM1269 936q0 -24 -12.5 -44t-33.5 -29q26 -9 34.5 -25.5t8.5 -46.5v-53h-65q0 9 0.5 26.5t0 25t-3 18.5t-8.5 16t-17.5 8.5
+t-29.5 3.5h-70v-98h-64v271l153 -1q49 0 78 -14.5t29 -57.5zM1798 327v-56h-216v271h216v-56h-151v-49h148v-55h-148v-54zM1372 1009v-271h-66v271h66zM2065 357q0 -86 -102 -86h-126v58h126q34 0 34 25q0 16 -17 21t-41.5 5t-49.5 3.5t-42 22.5t-17 55q0 39 26 60t66 21
+h130v-57h-119q-36 0 -36 -25q0 -16 17.5 -20.5t42 -4t49 -2.5t42 -21.5t17.5 -54.5zM2304 407v-101q-24 -35 -88 -35h-125v58h125q33 0 33 25q0 13 -12.5 19t-31 5.5t-40 2t-40 8t-31 24t-12.5 48.5q0 39 26.5 60t66.5 21h129v-57h-118q-36 0 -36 -25q0 -20 29 -22t68.5 -5
+t56.5 -26zM2139 1008v-270h-92l-122 203v-203h-132l-26 60h-134l-25 -60h-75q-129 0 -129 133q0 138 133 138h63v-59q-7 0 -28 1t-28.5 0.5t-23 -2t-21.5 -6.5t-14.5 -13.5t-11.5 -23t-3 -33.5q0 -38 13.5 -58t49.5 -20h29l92 213h97l109 -256v256h99l114 -188v188h66z" />
+    <glyph glyph-name="_469" unicode="&#xf1f4;" horiz-adv-x="2304" 
+d="M745 630q0 -37 -25.5 -61.5t-62.5 -24.5q-29 0 -46.5 16t-17.5 44q0 37 25 62.5t62 25.5q28 0 46.5 -16.5t18.5 -45.5zM1530 779q0 -42 -22 -57t-66 -15l-32 -1l17 107q2 11 13 11h18q22 0 35 -2t25 -12.5t12 -30.5zM1881 630q0 -36 -25.5 -61t-61.5 -25q-29 0 -47 16
+t-18 44q0 37 25 62.5t62 25.5q28 0 46.5 -16.5t18.5 -45.5zM513 801q0 59 -38.5 85.5t-100.5 26.5h-160q-19 0 -21 -19l-65 -408q-1 -6 3 -11t10 -5h76q20 0 22 19l18 110q1 8 7 13t15 6.5t17 1.5t19 -1t14 -1q86 0 135 48.5t49 134.5zM822 489l41 261q1 6 -3 11t-10 5h-76
+q-14 0 -17 -33q-27 40 -95 40q-72 0 -122.5 -54t-50.5 -127q0 -59 34.5 -94t92.5 -35q28 0 58 12t48 32q-4 -12 -4 -21q0 -16 13 -16h69q19 0 22 19zM1269 752q0 5 -4 9.5t-9 4.5h-77q-11 0 -18 -10l-106 -156l-44 150q-5 16 -22 16h-75q-5 0 -9 -4.5t-4 -9.5q0 -2 19.5 -59
+t42 -123t23.5 -70q-82 -112 -82 -120q0 -13 13 -13h77q11 0 18 10l255 368q2 2 2 7zM1649 801q0 59 -38.5 85.5t-100.5 26.5h-159q-20 0 -22 -19l-65 -408q-1 -6 3 -11t10 -5h82q12 0 16 13l18 116q1 8 7 13t15 6.5t17 1.5t19 -1t14 -1q86 0 135 48.5t49 134.5zM1958 489
+l41 261q1 6 -3 11t-10 5h-76q-14 0 -17 -33q-26 40 -95 40q-72 0 -122.5 -54t-50.5 -127q0 -59 34.5 -94t92.5 -35q29 0 59 12t47 32q0 -1 -2 -9t-2 -12q0 -16 13 -16h69q19 0 22 19zM2176 898v1q0 14 -13 14h-74q-11 0 -13 -11l-65 -416l-1 -2q0 -5 4 -9.5t10 -4.5h66
+q19 0 21 19zM392 764q-5 -35 -26 -46t-60 -11l-33 -1l17 107q2 11 13 11h19q40 0 58 -11.5t12 -48.5zM2304 1280v-1280q0 -52 -38 -90t-90 -38h-2048q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h2048q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_470" unicode="&#xf1f5;" horiz-adv-x="2304" 
+d="M1597 633q0 -69 -21 -106q-19 -35 -52 -35q-23 0 -41 9v224q29 30 57 30q57 0 57 -122zM2035 669h-110q6 98 56 98q51 0 54 -98zM476 534q0 59 -33 91.5t-101 57.5q-36 13 -52 24t-16 25q0 26 38 26q58 0 124 -33l18 112q-67 32 -149 32q-77 0 -123 -38q-48 -39 -48 -109
+q0 -58 32.5 -90.5t99.5 -56.5q39 -14 54.5 -25.5t15.5 -27.5q0 -31 -48 -31q-29 0 -70 12.5t-72 30.5l-18 -113q72 -41 168 -41q81 0 129 37q51 41 51 117zM771 749l19 111h-96v135l-129 -21l-18 -114l-46 -8l-17 -103h62v-219q0 -84 44 -120q38 -30 111 -30q32 0 79 11v118
+q-32 -7 -44 -7q-42 0 -42 50v197h77zM1087 724v139q-15 3 -28 3q-32 0 -55.5 -16t-33.5 -46l-10 56h-131v-471h150v306q26 31 82 31q16 0 26 -2zM1124 389h150v471h-150v-471zM1746 638q0 122 -45 179q-40 52 -111 52q-64 0 -117 -56l-8 47h-132v-645l150 25v151
+q36 -11 68 -11q83 0 134 56q61 65 61 202zM1278 986q0 33 -23 56t-56 23t-56 -23t-23 -56t23 -56.5t56 -23.5t56 23.5t23 56.5zM2176 629q0 113 -48 176q-50 64 -144 64q-96 0 -151.5 -66t-55.5 -180q0 -128 63 -188q55 -55 161 -55q101 0 160 40l-16 103q-57 -31 -128 -31
+q-43 0 -63 19q-23 19 -28 66h248q2 14 2 52zM2304 1280v-1280q0 -52 -38 -90t-90 -38h-2048q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h2048q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_471" unicode="&#xf1f6;" horiz-adv-x="2048" 
+d="M1558 684q61 -356 298 -556q0 -52 -38 -90t-90 -38h-448q0 -106 -75 -181t-181 -75t-180.5 74.5t-75.5 180.5zM1024 -176q16 0 16 16t-16 16q-59 0 -101.5 42.5t-42.5 101.5q0 16 -16 16t-16 -16q0 -73 51.5 -124.5t124.5 -51.5zM2026 1424q8 -10 7.5 -23.5t-10.5 -22.5
+l-1872 -1622q-10 -8 -23.5 -7t-21.5 11l-84 96q-8 10 -7.5 23.5t10.5 21.5l186 161q-19 32 -19 66q50 42 91 88t85 119.5t74.5 158.5t50 206t19.5 260q0 152 117 282.5t307 158.5q-8 19 -8 39q0 40 28 68t68 28t68 -28t28 -68q0 -20 -8 -39q124 -18 219 -82.5t148 -157.5
+l418 363q10 8 23.5 7t21.5 -11z" />
+    <glyph glyph-name="_472" unicode="&#xf1f7;" horiz-adv-x="2048" 
+d="M1040 -160q0 16 -16 16q-59 0 -101.5 42.5t-42.5 101.5q0 16 -16 16t-16 -16q0 -73 51.5 -124.5t124.5 -51.5q16 0 16 16zM503 315l877 760q-42 88 -132.5 146.5t-223.5 58.5q-93 0 -169.5 -31.5t-121.5 -80.5t-69 -103t-24 -105q0 -384 -137 -645zM1856 128
+q0 -52 -38 -90t-90 -38h-448q0 -106 -75 -181t-181 -75t-180.5 74.5t-75.5 180.5l149 129h757q-166 187 -227 459l111 97q61 -356 298 -556zM1942 1520l84 -96q8 -10 7.5 -23.5t-10.5 -22.5l-1872 -1622q-10 -8 -23.5 -7t-21.5 11l-84 96q-8 10 -7.5 23.5t10.5 21.5l186 161
+q-19 32 -19 66q50 42 91 88t85 119.5t74.5 158.5t50 206t19.5 260q0 152 117 282.5t307 158.5q-8 19 -8 39q0 40 28 68t68 28t68 -28t28 -68q0 -20 -8 -39q124 -18 219 -82.5t148 -157.5l418 363q10 8 23.5 7t21.5 -11z" />
+    <glyph glyph-name="_473" unicode="&#xf1f8;" horiz-adv-x="1408" 
+d="M512 160v704q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-704q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM768 160v704q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-704q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1024 160v704q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-704
+q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM480 1152h448l-48 117q-7 9 -17 11h-317q-10 -2 -17 -11zM1408 1120v-64q0 -14 -9 -23t-23 -9h-96v-948q0 -83 -47 -143.5t-113 -60.5h-832q-66 0 -113 58.5t-47 141.5v952h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h309l70 167
+q15 37 54 63t79 26h320q40 0 79 -26t54 -63l70 -167h309q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="_474" unicode="&#xf1f9;" 
+d="M1150 462v-109q0 -50 -36.5 -89t-94 -60.5t-118 -32.5t-117.5 -11q-205 0 -342.5 139t-137.5 346q0 203 136 339t339 136q34 0 75.5 -4.5t93 -18t92.5 -34t69 -56.5t28 -81v-109q0 -16 -16 -16h-118q-16 0 -16 16v70q0 43 -65.5 67.5t-137.5 24.5q-140 0 -228.5 -91.5
+t-88.5 -237.5q0 -151 91.5 -249.5t233.5 -98.5q68 0 138 24t70 66v70q0 7 4.5 11.5t10.5 4.5h119q6 0 11 -4.5t5 -11.5zM768 1280q-130 0 -248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5
+t-51 248.5t-136.5 204t-204 136.5t-248.5 51zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_475" unicode="&#xf1fa;" 
+d="M972 761q0 108 -53.5 169t-147.5 61q-63 0 -124 -30.5t-110 -84.5t-79.5 -137t-30.5 -180q0 -112 53.5 -173t150.5 -61q96 0 176 66.5t122.5 166t42.5 203.5zM1536 640q0 -111 -37 -197t-98.5 -135t-131.5 -74.5t-145 -27.5q-6 0 -15.5 -0.5t-16.5 -0.5q-95 0 -142 53
+q-28 33 -33 83q-52 -66 -131.5 -110t-173.5 -44q-161 0 -249.5 95.5t-88.5 269.5q0 157 66 290t179 210.5t246 77.5q87 0 155 -35.5t106 -99.5l2 19l11 56q1 6 5.5 12t9.5 6h118q5 0 13 -11q5 -5 3 -16l-120 -614q-5 -24 -5 -48q0 -39 12.5 -52t44.5 -13q28 1 57 5.5t73 24
+t77 50t57 89.5t24 137q0 292 -174 466t-466 174q-130 0 -248.5 -51t-204 -136.5t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51q228 0 405 144q11 9 24 8t21 -12l41 -49q8 -12 7 -24q-2 -13 -12 -22q-102 -83 -227.5 -128t-258.5 -45q-156 0 -298 61
+t-245 164t-164 245t-61 298t61 298t164 245t245 164t298 61q344 0 556 -212t212 -556z" />
+    <glyph glyph-name="_476" unicode="&#xf1fb;" horiz-adv-x="1792" 
+d="M1698 1442q94 -94 94 -226.5t-94 -225.5l-225 -223l104 -104q10 -10 10 -23t-10 -23l-210 -210q-10 -10 -23 -10t-23 10l-105 105l-603 -603q-37 -37 -90 -37h-203l-256 -128l-64 64l128 256v203q0 53 37 90l603 603l-105 105q-10 10 -10 23t10 23l210 210q10 10 23 10
+t23 -10l104 -104l223 225q93 94 225.5 94t226.5 -94zM512 64l576 576l-192 192l-576 -576v-192h192z" />
+    <glyph glyph-name="f1fc" unicode="&#xf1fc;" horiz-adv-x="1792" 
+d="M1615 1536q70 0 122.5 -46.5t52.5 -116.5q0 -63 -45 -151q-332 -629 -465 -752q-97 -91 -218 -91q-126 0 -216.5 92.5t-90.5 219.5q0 128 92 212l638 579q59 54 130 54zM706 502q39 -76 106.5 -130t150.5 -76l1 -71q4 -213 -129.5 -347t-348.5 -134q-123 0 -218 46.5
+t-152.5 127.5t-86.5 183t-29 220q7 -5 41 -30t62 -44.5t59 -36.5t46 -17q41 0 55 37q25 66 57.5 112.5t69.5 76t88 47.5t103 25.5t125 10.5z" />
+    <glyph glyph-name="_478" unicode="&#xf1fd;" horiz-adv-x="1792" 
+d="M1792 128v-384h-1792v384q45 0 85 14t59 27.5t47 37.5q30 27 51.5 38t56.5 11q24 0 44 -7t31 -15t33 -27q29 -25 47 -38t58 -27t86 -14q45 0 85 14.5t58 27t48 37.5q21 19 32.5 27t31 15t43.5 7q35 0 56.5 -11t51.5 -38q28 -24 47 -37.5t59 -27.5t85 -14t85 14t59 27.5
+t47 37.5q30 27 51.5 38t56.5 11q34 0 55.5 -11t51.5 -38q28 -24 47 -37.5t59 -27.5t85 -14zM1792 448v-192q-24 0 -44 7t-31 15t-33 27q-29 25 -47 38t-58 27t-85 14q-46 0 -86 -14t-58 -27t-47 -38q-22 -19 -33 -27t-31 -15t-44 -7q-35 0 -56.5 11t-51.5 38q-29 25 -47 38
+t-58 27t-86 14q-45 0 -85 -14.5t-58 -27t-48 -37.5q-21 -19 -32.5 -27t-31 -15t-43.5 -7q-35 0 -56.5 11t-51.5 38q-28 24 -47 37.5t-59 27.5t-85 14q-46 0 -86 -14t-58 -27t-47 -38q-30 -27 -51.5 -38t-56.5 -11v192q0 80 56 136t136 56h64v448h256v-448h256v448h256v-448
+h256v448h256v-448h64q80 0 136 -56t56 -136zM512 1312q0 -77 -36 -118.5t-92 -41.5q-53 0 -90.5 37.5t-37.5 90.5q0 29 9.5 51t23.5 34t31 28t31 31.5t23.5 44.5t9.5 67q38 0 83 -74t45 -150zM1024 1312q0 -77 -36 -118.5t-92 -41.5q-53 0 -90.5 37.5t-37.5 90.5
+q0 29 9.5 51t23.5 34t31 28t31 31.5t23.5 44.5t9.5 67q38 0 83 -74t45 -150zM1536 1312q0 -77 -36 -118.5t-92 -41.5q-53 0 -90.5 37.5t-37.5 90.5q0 29 9.5 51t23.5 34t31 28t31 31.5t23.5 44.5t9.5 67q38 0 83 -74t45 -150z" />
+    <glyph glyph-name="_479" unicode="&#xf1fe;" horiz-adv-x="2048" 
+d="M2048 0v-128h-2048v1536h128v-1408h1920zM1664 1024l256 -896h-1664v576l448 576l576 -576z" />
+    <glyph glyph-name="_480" unicode="&#xf200;" horiz-adv-x="1792" 
+d="M768 646l546 -546q-106 -108 -247.5 -168t-298.5 -60q-209 0 -385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103v-762zM955 640h773q0 -157 -60 -298.5t-168 -247.5zM1664 768h-768v768q209 0 385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_481" unicode="&#xf201;" horiz-adv-x="2048" 
+d="M2048 0v-128h-2048v1536h128v-1408h1920zM1920 1248v-435q0 -21 -19.5 -29.5t-35.5 7.5l-121 121l-633 -633q-10 -10 -23 -10t-23 10l-233 233l-416 -416l-192 192l585 585q10 10 23 10t23 -10l233 -233l464 464l-121 121q-16 16 -7.5 35.5t29.5 19.5h435q14 0 23 -9
+t9 -23z" />
+    <glyph glyph-name="_482" unicode="&#xf202;" horiz-adv-x="1792" 
+d="M1292 832q0 -6 10 -41q10 -29 25 -49.5t41 -34t44 -20t55 -16.5q325 -91 325 -332q0 -146 -105.5 -242.5t-254.5 -96.5q-59 0 -111.5 18.5t-91.5 45.5t-77 74.5t-63 87.5t-53.5 103.5t-43.5 103t-39.5 106.5t-35.5 95q-32 81 -61.5 133.5t-73.5 96.5t-104 64t-142 20
+q-96 0 -183 -55.5t-138 -144.5t-51 -185q0 -160 106.5 -279.5t263.5 -119.5q177 0 258 95q56 63 83 116l84 -152q-15 -34 -44 -70l1 -1q-131 -152 -388 -152q-147 0 -269.5 79t-190.5 207.5t-68 274.5q0 105 43.5 206t116 176.5t172 121.5t204.5 46q87 0 159 -19t123.5 -50
+t95 -80t72.5 -99t58.5 -117t50.5 -124.5t50 -130.5t55 -127q96 -200 233 -200q81 0 138.5 48.5t57.5 128.5q0 42 -19 72t-50.5 46t-72.5 31.5t-84.5 27t-87.5 34t-81 52t-65 82t-39 122.5q-3 16 -3 33q0 110 87.5 192t198.5 78q78 -3 120.5 -14.5t90.5 -53.5h-1
+q12 -11 23 -24.5t26 -36t19 -27.5l-129 -99q-26 49 -54 70v1q-23 21 -97 21q-49 0 -84 -33t-35 -83z" />
+    <glyph glyph-name="_483" unicode="&#xf203;" 
+d="M1432 484q0 173 -234 239q-35 10 -53 16.5t-38 25t-29 46.5q0 2 -2 8.5t-3 12t-1 7.5q0 36 24.5 59.5t60.5 23.5q54 0 71 -15h-1q20 -15 39 -51l93 71q-39 54 -49 64q-33 29 -67.5 39t-85.5 10q-80 0 -142 -57.5t-62 -137.5q0 -7 2 -23q16 -96 64.5 -140t148.5 -73
+q29 -8 49 -15.5t45 -21.5t38.5 -34.5t13.5 -46.5v-5q1 -58 -40.5 -93t-100.5 -35q-97 0 -167 144q-23 47 -51.5 121.5t-48 125.5t-54 110.5t-74 95.5t-103.5 60.5t-147 24.5q-101 0 -192 -56t-144 -148t-50 -192v-1q4 -108 50.5 -199t133.5 -147.5t196 -56.5q186 0 279 110
+q20 27 31 51l-60 109q-42 -80 -99 -116t-146 -36q-115 0 -191 87t-76 204q0 105 82 189t186 84q112 0 170 -53.5t104 -172.5q8 -21 25.5 -68.5t28.5 -76.5t31.5 -74.5t38.5 -74t45.5 -62.5t55.5 -53.5t66 -33t80 -13.5q107 0 183 69.5t76 174.5zM1536 1120v-960
+q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_484" unicode="&#xf204;" horiz-adv-x="2048" 
+d="M1152 640q0 104 -40.5 198.5t-109.5 163.5t-163.5 109.5t-198.5 40.5t-198.5 -40.5t-163.5 -109.5t-109.5 -163.5t-40.5 -198.5t40.5 -198.5t109.5 -163.5t163.5 -109.5t198.5 -40.5t198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5zM1920 640q0 104 -40.5 198.5
+t-109.5 163.5t-163.5 109.5t-198.5 40.5h-386q119 -90 188.5 -224t69.5 -288t-69.5 -288t-188.5 -224h386q104 0 198.5 40.5t163.5 109.5t109.5 163.5t40.5 198.5zM2048 640q0 -130 -51 -248.5t-136.5 -204t-204 -136.5t-248.5 -51h-768q-130 0 -248.5 51t-204 136.5
+t-136.5 204t-51 248.5t51 248.5t136.5 204t204 136.5t248.5 51h768q130 0 248.5 -51t204 -136.5t136.5 -204t51 -248.5z" />
+    <glyph glyph-name="_485" unicode="&#xf205;" horiz-adv-x="2048" 
+d="M0 640q0 130 51 248.5t136.5 204t204 136.5t248.5 51h768q130 0 248.5 -51t204 -136.5t136.5 -204t51 -248.5t-51 -248.5t-136.5 -204t-204 -136.5t-248.5 -51h-768q-130 0 -248.5 51t-204 136.5t-136.5 204t-51 248.5zM1408 128q104 0 198.5 40.5t163.5 109.5
+t109.5 163.5t40.5 198.5t-40.5 198.5t-109.5 163.5t-163.5 109.5t-198.5 40.5t-198.5 -40.5t-163.5 -109.5t-109.5 -163.5t-40.5 -198.5t40.5 -198.5t109.5 -163.5t163.5 -109.5t198.5 -40.5z" />
+    <glyph glyph-name="_486" unicode="&#xf206;" horiz-adv-x="2304" 
+d="M762 384h-314q-40 0 -57.5 35t6.5 67l188 251q-65 31 -137 31q-132 0 -226 -94t-94 -226t94 -226t226 -94q115 0 203 72.5t111 183.5zM576 512h186q-18 85 -75 148zM1056 512l288 384h-480l-99 -132q105 -103 126 -252h165zM2176 448q0 132 -94 226t-226 94
+q-60 0 -121 -24l174 -260q15 -23 10 -49t-27 -40q-15 -11 -36 -11q-35 0 -53 29l-174 260q-93 -95 -93 -225q0 -132 94 -226t226 -94t226 94t94 226zM2304 448q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 97 39.5 183.5t109.5 149.5l-65 98l-353 -469
+q-18 -26 -51 -26h-197q-23 -164 -149 -274t-294 -110q-185 0 -316.5 131.5t-131.5 316.5t131.5 316.5t316.5 131.5q114 0 215 -55l137 183h-224q-26 0 -45 19t-19 45t19 45t45 19h384v-128h435l-85 128h-222q-26 0 -45 19t-19 45t19 45t45 19h256q33 0 53 -28l267 -400
+q91 44 192 44q185 0 316.5 -131.5t131.5 -316.5z" />
+    <glyph glyph-name="_487" unicode="&#xf207;" 
+d="M384 320q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1408 320q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1362 716l-72 384q-5 23 -22.5 37.5t-40.5 14.5
+h-918q-23 0 -40.5 -14.5t-22.5 -37.5l-72 -384q-5 -30 14 -53t49 -23h1062q30 0 49 23t14 53zM1136 1328q0 20 -14 34t-34 14h-640q-20 0 -34 -14t-14 -34t14 -34t34 -14h640q20 0 34 14t14 34zM1536 603v-603h-128v-128q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5
+t-37.5 90.5v128h-768v-128q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5v128h-128v603q0 112 25 223l103 454q9 78 97.5 137t230 89t312.5 30t312.5 -30t230 -89t97.5 -137l105 -454q23 -102 23 -223z" />
+    <glyph glyph-name="_488" unicode="&#xf208;" horiz-adv-x="2048" 
+d="M1463 704q0 -35 -25 -60.5t-61 -25.5h-702q-36 0 -61 25.5t-25 60.5t25 60.5t61 25.5h702q36 0 61 -25.5t25 -60.5zM1677 704q0 86 -23 170h-982q-36 0 -61 25t-25 60q0 36 25 61t61 25h908q-88 143 -235 227t-320 84q-177 0 -327.5 -87.5t-238 -237.5t-87.5 -327
+q0 -86 23 -170h982q36 0 61 -25t25 -60q0 -36 -25 -61t-61 -25h-908q88 -143 235.5 -227t320.5 -84q132 0 253 51.5t208 139t139 208t52 253.5zM2048 959q0 -35 -25 -60t-61 -25h-131q17 -85 17 -170q0 -167 -65.5 -319.5t-175.5 -263t-262.5 -176t-319.5 -65.5
+q-246 0 -448.5 133t-301.5 350h-189q-36 0 -61 25t-25 61q0 35 25 60t61 25h132q-17 85 -17 170q0 167 65.5 319.5t175.5 263t262.5 176t320.5 65.5q245 0 447.5 -133t301.5 -350h188q36 0 61 -25t25 -61z" />
+    <glyph glyph-name="_489" unicode="&#xf209;" horiz-adv-x="1280" 
+d="M953 1158l-114 -328l117 -21q165 451 165 518q0 56 -38 56q-57 0 -130 -225zM654 471l33 -88q37 42 71 67l-33 5.5t-38.5 7t-32.5 8.5zM362 1367q0 -98 159 -521q17 10 49 10q15 0 75 -5l-121 351q-75 220 -123 220q-19 0 -29 -17.5t-10 -37.5zM283 608q0 -36 51.5 -119
+t117.5 -153t100 -70q14 0 25.5 13t11.5 27q0 24 -32 102q-13 32 -32 72t-47.5 89t-61.5 81t-62 32q-20 0 -45.5 -27t-25.5 -47zM125 273q0 -41 25 -104q59 -145 183.5 -227t281.5 -82q227 0 382 170q152 169 152 427q0 43 -1 67t-11.5 62t-30.5 56q-56 49 -211.5 75.5
+t-270.5 26.5q-37 0 -49 -11q-12 -5 -12 -35q0 -34 21.5 -60t55.5 -40t77.5 -23.5t87.5 -11.5t85 -4t70 0h23q24 0 40 -19q15 -19 19 -55q-28 -28 -96 -54q-61 -22 -93 -46q-64 -46 -108.5 -114t-44.5 -137q0 -31 18.5 -88.5t18.5 -87.5l-3 -12q-4 -12 -4 -14
+q-137 10 -146 216q-8 -2 -41 -2q2 -7 2 -21q0 -53 -40.5 -89.5t-94.5 -36.5q-82 0 -166.5 78t-84.5 159q0 34 33 67q52 -64 60 -76q77 -104 133 -104q12 0 26.5 8.5t14.5 20.5q0 34 -87.5 145t-116.5 111q-43 0 -70 -44.5t-27 -90.5zM11 264q0 101 42.5 163t136.5 88
+q-28 74 -28 104q0 62 61 123t122 61q29 0 70 -15q-163 462 -163 567q0 80 41 130.5t119 50.5q131 0 325 -581q6 -17 8 -23q6 16 29 79.5t43.5 118.5t54 127.5t64.5 123t70.5 86.5t76.5 36q71 0 112 -49t41 -122q0 -108 -159 -550q61 -15 100.5 -46t58.5 -78t26 -93.5
+t7 -110.5q0 -150 -47 -280t-132 -225t-211 -150t-278 -55q-111 0 -223 42q-149 57 -258 191.5t-109 286.5z" />
+    <glyph glyph-name="_490" unicode="&#xf20a;" horiz-adv-x="2048" 
+d="M785 528h207q-14 -158 -98.5 -248.5t-214.5 -90.5q-162 0 -254.5 116t-92.5 316q0 194 93 311.5t233 117.5q148 0 232 -87t97 -247h-203q-5 64 -35.5 99t-81.5 35q-57 0 -88.5 -60.5t-31.5 -177.5q0 -48 5 -84t18 -69.5t40 -51.5t66 -18q95 0 109 139zM1497 528h206
+q-14 -158 -98 -248.5t-214 -90.5q-162 0 -254.5 116t-92.5 316q0 194 93 311.5t233 117.5q148 0 232 -87t97 -247h-204q-4 64 -35 99t-81 35q-57 0 -88.5 -60.5t-31.5 -177.5q0 -48 5 -84t18 -69.5t39.5 -51.5t65.5 -18q49 0 76.5 38t33.5 101zM1856 647q0 207 -15.5 307
+t-60.5 161q-6 8 -13.5 14t-21.5 15t-16 11q-86 63 -697 63q-625 0 -710 -63q-5 -4 -17.5 -11.5t-21 -14t-14.5 -14.5q-45 -60 -60 -159.5t-15 -308.5q0 -208 15 -307.5t60 -160.5q6 -8 15 -15t20.5 -14t17.5 -12q44 -33 239.5 -49t470.5 -16q610 0 697 65q5 4 17 11t20.5 14
+t13.5 16q46 60 61 159t15 309zM2048 1408v-1536h-2048v1536h2048z" />
+    <glyph glyph-name="_491" unicode="&#xf20b;" 
+d="M992 912v-496q0 -14 -9 -23t-23 -9h-160q-14 0 -23 9t-9 23v496q0 112 -80 192t-192 80h-272v-1152q0 -14 -9 -23t-23 -9h-160q-14 0 -23 9t-9 23v1344q0 14 9 23t23 9h464q135 0 249 -66.5t180.5 -180.5t66.5 -249zM1376 1376v-880q0 -135 -66.5 -249t-180.5 -180.5
+t-249 -66.5h-464q-14 0 -23 9t-9 23v960q0 14 9 23t23 9h160q14 0 23 -9t9 -23v-768h272q112 0 192 80t80 192v880q0 14 9 23t23 9h160q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="_492" unicode="&#xf20c;" 
+d="M1311 694v-114q0 -24 -13.5 -38t-37.5 -14h-202q-24 0 -38 14t-14 38v114q0 24 14 38t38 14h202q24 0 37.5 -14t13.5 -38zM821 464v250q0 53 -32.5 85.5t-85.5 32.5h-133q-68 0 -96 -52q-28 52 -96 52h-130q-53 0 -85.5 -32.5t-32.5 -85.5v-250q0 -22 21 -22h55
+q22 0 22 22v230q0 24 13.5 38t38.5 14h94q24 0 38 -14t14 -38v-230q0 -22 21 -22h54q22 0 22 22v230q0 24 14 38t38 14h97q24 0 37.5 -14t13.5 -38v-230q0 -22 22 -22h55q21 0 21 22zM1410 560v154q0 53 -33 85.5t-86 32.5h-264q-53 0 -86 -32.5t-33 -85.5v-410
+q0 -21 22 -21h55q21 0 21 21v180q31 -42 94 -42h191q53 0 86 32.5t33 85.5zM1536 1176v-1072q0 -96 -68 -164t-164 -68h-1072q-96 0 -164 68t-68 164v1072q0 96 68 164t164 68h1072q96 0 164 -68t68 -164z" />
+    <glyph glyph-name="_493" unicode="&#xf20d;" 
+d="M915 450h-294l147 551zM1001 128h311l-324 1024h-440l-324 -1024h311l383 314zM1536 1120v-960q0 -118 -85 -203t-203 -85h-960q-118 0 -203 85t-85 203v960q0 118 85 203t203 85h960q118 0 203 -85t85 -203z" />
+    <glyph glyph-name="_494" unicode="&#xf20e;" horiz-adv-x="2048" 
+d="M2048 641q0 -21 -13 -36.5t-33 -19.5l-205 -356q3 -9 3 -18q0 -20 -12.5 -35.5t-32.5 -19.5l-193 -337q3 -8 3 -16q0 -23 -16.5 -40t-40.5 -17q-25 0 -41 18h-400q-17 -20 -43 -20t-43 20h-399q-17 -20 -43 -20q-23 0 -40 16.5t-17 40.5q0 8 4 20l-193 335
+q-20 4 -32.5 19.5t-12.5 35.5q0 9 3 18l-206 356q-20 5 -32.5 20.5t-12.5 35.5q0 21 13.5 36.5t33.5 19.5l199 344q0 1 -0.5 3t-0.5 3q0 36 34 51l209 363q-4 10 -4 18q0 24 17 40.5t40 16.5q26 0 44 -21h396q16 21 43 21t43 -21h398q18 21 44 21q23 0 40 -16.5t17 -40.5
+q0 -6 -4 -18l207 -358q23 -1 39 -17.5t16 -38.5q0 -13 -7 -27l187 -324q19 -4 31.5 -19.5t12.5 -35.5zM1063 -158h389l-342 354h-143l-342 -354h360q18 16 39 16t39 -16zM112 654q1 -4 1 -13q0 -10 -2 -15l208 -360l15 -6l188 199v347l-187 194q-13 -8 -29 -10zM986 1438
+h-388l190 -200l554 200h-280q-16 -16 -38 -16t-38 16zM1689 226q1 6 5 11l-64 68l-17 -79h76zM1583 226l22 105l-252 266l-296 -307l63 -64h463zM1495 -142l16 28l65 310h-427l333 -343q8 4 13 5zM578 -158h5l342 354h-373v-335l4 -6q14 -5 22 -13zM552 226h402l64 66
+l-309 321l-157 -166v-221zM359 226h163v189l-168 -177q4 -8 5 -12zM358 1051q0 -1 0.5 -2t0.5 -2q0 -16 -8 -29l171 -177v269zM552 1121v-311l153 -157l297 314l-223 236zM556 1425l-4 -8v-264l205 74l-191 201q-6 -2 -10 -3zM1447 1438h-16l-621 -224l213 -225zM1023 946
+l-297 -315l311 -319l296 307zM688 634l-136 141v-284zM1038 270l-42 -44h85zM1374 618l238 -251l132 624l-3 5l-1 1zM1718 1018q-8 13 -8 29v2l-216 376q-5 1 -13 5l-437 -463l310 -327zM522 1142v223l-163 -282zM522 196h-163l163 -283v283zM1607 196l-48 -227l130 227h-82
+zM1729 266l207 361q-2 10 -2 14q0 1 3 16l-171 296l-129 -612l77 -82q5 3 15 7z" />
+    <glyph glyph-name="f210" unicode="&#xf210;" 
+d="M0 856q0 131 91.5 226.5t222.5 95.5h742l352 358v-1470q0 -132 -91.5 -227t-222.5 -95h-780q-131 0 -222.5 95t-91.5 227v790zM1232 102l-176 180v425q0 46 -32 79t-78 33h-484q-46 0 -78 -33t-32 -79v-492q0 -46 32.5 -79.5t77.5 -33.5h770z" />
+    <glyph glyph-name="_496" unicode="&#xf211;" 
+d="M934 1386q-317 -121 -556 -362.5t-358 -560.5q-20 89 -20 176q0 208 102.5 384.5t278.5 279t384 102.5q82 0 169 -19zM1203 1267q93 -65 164 -155q-389 -113 -674.5 -400.5t-396.5 -676.5q-93 72 -155 162q112 386 395 671t667 399zM470 -67q115 356 379.5 622t619.5 384
+q40 -92 54 -195q-292 -120 -516 -345t-343 -518q-103 14 -194 52zM1536 -125q-193 50 -367 115q-135 -84 -290 -107q109 205 274 370.5t369 275.5q-21 -152 -101 -284q65 -175 115 -370z" />
+    <glyph glyph-name="f212" unicode="&#xf212;" horiz-adv-x="2048" 
+d="M1893 1144l155 -1272q-131 0 -257 57q-200 91 -393 91q-226 0 -374 -148q-148 148 -374 148q-193 0 -393 -91q-128 -57 -252 -57h-5l155 1272q224 127 482 127q233 0 387 -106q154 106 387 106q258 0 482 -127zM1398 157q129 0 232 -28.5t260 -93.5l-124 1021
+q-171 78 -368 78q-224 0 -374 -141q-150 141 -374 141q-197 0 -368 -78l-124 -1021q105 43 165.5 65t148.5 39.5t178 17.5q202 0 374 -108q172 108 374 108zM1438 191l-55 907q-211 -4 -359 -155q-152 155 -374 155q-176 0 -336 -66l-114 -941q124 51 228.5 76t221.5 25
+q209 0 374 -102q172 107 374 102z" />
+    <glyph glyph-name="_498" unicode="&#xf213;" horiz-adv-x="2048" 
+d="M1500 165v733q0 21 -15 36t-35 15h-93q-20 0 -35 -15t-15 -36v-733q0 -20 15 -35t35 -15h93q20 0 35 15t15 35zM1216 165v531q0 20 -15 35t-35 15h-101q-20 0 -35 -15t-15 -35v-531q0 -20 15 -35t35 -15h101q20 0 35 15t15 35zM924 165v429q0 20 -15 35t-35 15h-101
+q-20 0 -35 -15t-15 -35v-429q0 -20 15 -35t35 -15h101q20 0 35 15t15 35zM632 165v362q0 20 -15 35t-35 15h-101q-20 0 -35 -15t-15 -35v-362q0 -20 15 -35t35 -15h101q20 0 35 15t15 35zM2048 311q0 -166 -118 -284t-284 -118h-1244q-166 0 -284 118t-118 284
+q0 116 63 214.5t168 148.5q-10 34 -10 73q0 113 80.5 193.5t193.5 80.5q102 0 180 -67q45 183 194 300t338 117q149 0 275 -73.5t199.5 -199.5t73.5 -275q0 -66 -14 -122q135 -33 221 -142.5t86 -247.5z" />
+    <glyph glyph-name="_499" unicode="&#xf214;" 
+d="M0 1536h1536v-1392l-776 -338l-760 338v1392zM1436 209v926h-1336v-926l661 -294zM1436 1235v201h-1336v-201h1336zM181 937v-115h-37v115h37zM181 789v-115h-37v115h37zM181 641v-115h-37v115h37zM181 493v-115h-37v115h37zM181 345v-115h-37v115h37zM207 202l15 34
+l105 -47l-15 -33zM343 142l15 34l105 -46l-15 -34zM478 82l15 34l105 -46l-15 -34zM614 23l15 33l104 -46l-15 -34zM797 10l105 46l15 -33l-105 -47zM932 70l105 46l15 -34l-105 -46zM1068 130l105 46l15 -34l-105 -46zM1203 189l105 47l15 -34l-105 -46zM259 1389v-36h-114
+v36h114zM421 1389v-36h-115v36h115zM583 1389v-36h-115v36h115zM744 1389v-36h-114v36h114zM906 1389v-36h-114v36h114zM1068 1389v-36h-115v36h115zM1230 1389v-36h-115v36h115zM1391 1389v-36h-114v36h114zM181 1049v-79h-37v115h115v-36h-78zM421 1085v-36h-115v36h115z
+M583 1085v-36h-115v36h115zM744 1085v-36h-114v36h114zM906 1085v-36h-114v36h114zM1068 1085v-36h-115v36h115zM1230 1085v-36h-115v36h115zM1355 970v79h-78v36h115v-115h-37zM1355 822v115h37v-115h-37zM1355 674v115h37v-115h-37zM1355 526v115h37v-115h-37zM1355 378
+v115h37v-115h-37zM1355 230v115h37v-115h-37zM760 265q-129 0 -221 91.5t-92 221.5q0 129 92 221t221 92q130 0 221.5 -92t91.5 -221q0 -130 -91.5 -221.5t-221.5 -91.5zM595 646q0 -36 19.5 -56.5t49.5 -25t64 -7t64 -2t49.5 -9t19.5 -30.5q0 -49 -112 -49q-97 0 -123 51
+h-3l-31 -63q67 -42 162 -42q29 0 56.5 5t55.5 16t45.5 33t17.5 53q0 46 -27.5 69.5t-67.5 27t-79.5 3t-67 5t-27.5 25.5q0 21 20.5 33t40.5 15t41 3q34 0 70.5 -11t51.5 -34h3l30 58q-3 1 -21 8.5t-22.5 9t-19.5 7t-22 7t-20 4.5t-24 4t-23 1q-29 0 -56.5 -5t-54 -16.5
+t-43 -34t-16.5 -53.5z" />
+    <glyph glyph-name="_500" unicode="&#xf215;" horiz-adv-x="2048" 
+d="M863 504q0 112 -79.5 191.5t-191.5 79.5t-191 -79.5t-79 -191.5t79 -191t191 -79t191.5 79t79.5 191zM1726 505q0 112 -79 191t-191 79t-191.5 -79t-79.5 -191q0 -113 79.5 -192t191.5 -79t191 79.5t79 191.5zM2048 1314v-1348q0 -44 -31.5 -75.5t-76.5 -31.5h-1832
+q-45 0 -76.5 31.5t-31.5 75.5v1348q0 44 31.5 75.5t76.5 31.5h431q44 0 76 -31.5t32 -75.5v-161h754v161q0 44 32 75.5t76 31.5h431q45 0 76.5 -31.5t31.5 -75.5z" />
+    <glyph glyph-name="_501" unicode="&#xf216;" horiz-adv-x="2048" 
+d="M1430 953zM1690 749q148 0 253 -98.5t105 -244.5q0 -157 -109 -261.5t-267 -104.5q-85 0 -162 27.5t-138 73.5t-118 106t-109 126t-103.5 132.5t-108.5 126.5t-117 106t-136 73.5t-159 27.5q-154 0 -251.5 -91.5t-97.5 -244.5q0 -157 104 -250t263 -93q100 0 208 37.5
+t193 98.5q5 4 21 18.5t30 24t22 9.5q14 0 24.5 -10.5t10.5 -24.5q0 -24 -60 -77q-101 -88 -234.5 -142t-260.5 -54q-133 0 -245.5 58t-180 165t-67.5 241q0 205 141.5 341t347.5 136q120 0 226.5 -43.5t185.5 -113t151.5 -153t139 -167.5t133.5 -153.5t149.5 -113
+t172.5 -43.5q102 0 168.5 61.5t66.5 162.5q0 95 -64.5 159t-159.5 64q-30 0 -81.5 -18.5t-68.5 -18.5q-20 0 -35.5 15t-15.5 35q0 18 8.5 57t8.5 59q0 159 -107.5 263t-266.5 104q-58 0 -111.5 -18.5t-84 -40.5t-55.5 -40.5t-33 -18.5q-15 0 -25.5 10.5t-10.5 25.5
+q0 19 25 46q59 67 147 103.5t182 36.5q191 0 318 -125.5t127 -315.5q0 -37 -4 -66q57 15 115 15z" />
+    <glyph glyph-name="_502" unicode="&#xf217;" horiz-adv-x="1664" 
+d="M1216 832q0 26 -19 45t-45 19h-128v128q0 26 -19 45t-45 19t-45 -19t-19 -45v-128h-128q-26 0 -45 -19t-19 -45t19 -45t45 -19h128v-128q0 -26 19 -45t45 -19t45 19t19 45v128h128q26 0 45 19t19 45zM640 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5
+t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1536 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1664 1088v-512q0 -24 -16 -42.5t-41 -21.5l-1044 -122q1 -7 4.5 -21.5t6 -26.5t2.5 -22q0 -16 -24 -64h920
+q26 0 45 -19t19 -45t-19 -45t-45 -19h-1024q-26 0 -45 19t-19 45q0 14 11 39.5t29.5 59.5t20.5 38l-177 823h-204q-26 0 -45 19t-19 45t19 45t45 19h256q16 0 28.5 -6.5t20 -15.5t13 -24.5t7.5 -26.5t5.5 -29.5t4.5 -25.5h1201q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="_503" unicode="&#xf218;" horiz-adv-x="1664" 
+d="M1280 832q0 26 -19 45t-45 19t-45 -19l-147 -146v293q0 26 -19 45t-45 19t-45 -19t-19 -45v-293l-147 146q-19 19 -45 19t-45 -19t-19 -45t19 -45l256 -256q19 -19 45 -19t45 19l256 256q19 19 19 45zM640 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5
+t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1536 0q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1664 1088v-512q0 -24 -16 -42.5t-41 -21.5l-1044 -122q1 -7 4.5 -21.5t6 -26.5t2.5 -22q0 -16 -24 -64h920
+q26 0 45 -19t19 -45t-19 -45t-45 -19h-1024q-26 0 -45 19t-19 45q0 14 11 39.5t29.5 59.5t20.5 38l-177 823h-204q-26 0 -45 19t-19 45t19 45t45 19h256q16 0 28.5 -6.5t20 -15.5t13 -24.5t7.5 -26.5t5.5 -29.5t4.5 -25.5h1201q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="_504" unicode="&#xf219;" horiz-adv-x="2048" 
+d="M212 768l623 -665l-300 665h-323zM1024 -4l349 772h-698zM538 896l204 384h-262l-288 -384h346zM1213 103l623 665h-323zM683 896h682l-204 384h-274zM1510 896h346l-288 384h-262zM1651 1382l384 -512q14 -18 13 -41.5t-17 -40.5l-960 -1024q-18 -20 -47 -20t-47 20
+l-960 1024q-16 17 -17 40.5t13 41.5l384 512q18 26 51 26h1152q33 0 51 -26z" />
+    <glyph glyph-name="_505" unicode="&#xf21a;" horiz-adv-x="2048" 
+d="M1811 -19q19 19 45 19t45 -19l128 -128l-90 -90l-83 83l-83 -83q-18 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83
+q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-128 128l90 90l83 -83l83 83q19 19 45 19t45 -19l83 -83l83 83q19 19 45 19t45 -19l83 -83l83 83q19 19 45 19t45 -19l83 -83l83 83q19 19 45 19t45 -19l83 -83l83 83q19 19 45 19t45 -19l83 -83l83 83
+q19 19 45 19t45 -19l83 -83zM237 19q-19 -19 -45 -19t-45 19l-128 128l90 90l83 -82l83 82q19 19 45 19t45 -19l83 -82l64 64v293l-210 314q-17 26 -7 56.5t40 40.5l177 58v299h128v128h256v128h256v-128h256v-128h128v-299l177 -58q30 -10 40 -40.5t-7 -56.5l-210 -314
+v-293l19 18q19 19 45 19t45 -19l83 -82l83 82q19 19 45 19t45 -19l128 -128l-90 -90l-83 83l-83 -83q-18 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83l-83 -83
+q-19 -19 -45 -19t-45 19l-83 83l-83 -83q-19 -19 -45 -19t-45 19l-83 83zM640 1152v-128l384 128l384 -128v128h-128v128h-512v-128h-128z" />
+    <glyph glyph-name="_506" unicode="&#xf21b;" 
+d="M576 0l96 448l-96 128l-128 64zM832 0l128 640l-128 -64l-96 -128zM992 1010q-2 4 -4 6q-10 8 -96 8q-70 0 -167 -19q-7 -2 -21 -2t-21 2q-97 19 -167 19q-86 0 -96 -8q-2 -2 -4 -6q2 -18 4 -27q2 -3 7.5 -6.5t7.5 -10.5q2 -4 7.5 -20.5t7 -20.5t7.5 -17t8.5 -17t9 -14
+t12 -13.5t14 -9.5t17.5 -8t20.5 -4t24.5 -2q36 0 59 12.5t32.5 30t14.5 34.5t11.5 29.5t17.5 12.5h12q11 0 17.5 -12.5t11.5 -29.5t14.5 -34.5t32.5 -30t59 -12.5q13 0 24.5 2t20.5 4t17.5 8t14 9.5t12 13.5t9 14t8.5 17t7.5 17t7 20.5t7.5 20.5q2 7 7.5 10.5t7.5 6.5
+q2 9 4 27zM1408 131q0 -121 -73 -190t-194 -69h-874q-121 0 -194 69t-73 190q0 61 4.5 118t19 125.5t37.5 123.5t63.5 103.5t93.5 74.5l-90 220h214q-22 64 -22 128q0 12 2 32q-194 40 -194 96q0 57 210 99q17 62 51.5 134t70.5 114q32 37 76 37q30 0 84 -31t84 -31t84 31
+t84 31q44 0 76 -37q36 -42 70.5 -114t51.5 -134q210 -42 210 -99q0 -56 -194 -96q7 -81 -20 -160h214l-82 -225q63 -33 107.5 -96.5t65.5 -143.5t29 -151.5t8 -148.5z" />
+    <glyph glyph-name="_507" unicode="&#xf21c;" horiz-adv-x="2304" 
+d="M2301 500q12 -103 -22 -198.5t-99 -163.5t-158.5 -106t-196.5 -31q-161 11 -279.5 125t-134.5 274q-12 111 27.5 210.5t118.5 170.5l-71 107q-96 -80 -151 -194t-55 -244q0 -27 -18.5 -46.5t-45.5 -19.5h-256h-69q-23 -164 -149 -274t-294 -110q-185 0 -316.5 131.5
+t-131.5 316.5t131.5 316.5t316.5 131.5q76 0 152 -27l24 45q-123 110 -304 110h-64q-26 0 -45 19t-19 45t19 45t45 19h128q78 0 145 -13.5t116.5 -38.5t71.5 -39.5t51 -36.5h512h115l-85 128h-222q-30 0 -49 22.5t-14 52.5q4 23 23 38t43 15h253q33 0 53 -28l70 -105
+l114 114q19 19 46 19h101q26 0 45 -19t19 -45v-128q0 -26 -19 -45t-45 -19h-179l115 -172q131 63 275 36q143 -26 244 -134.5t118 -253.5zM448 128q115 0 203 72.5t111 183.5h-314q-35 0 -55 31q-18 32 -1 63l147 277q-47 13 -91 13q-132 0 -226 -94t-94 -226t94 -226
+t226 -94zM1856 128q132 0 226 94t94 226t-94 226t-226 94q-60 0 -121 -24l174 -260q15 -23 10 -49t-27 -40q-15 -11 -36 -11q-35 0 -53 29l-174 260q-93 -95 -93 -225q0 -132 94 -226t226 -94z" />
+    <glyph glyph-name="_508" unicode="&#xf21d;" 
+d="M1408 0q0 -63 -61.5 -113.5t-164 -81t-225 -46t-253.5 -15.5t-253.5 15.5t-225 46t-164 81t-61.5 113.5q0 49 33 88.5t91 66.5t118 44.5t131 29.5q26 5 48 -10.5t26 -41.5q5 -26 -10.5 -48t-41.5 -26q-58 -10 -106 -23.5t-76.5 -25.5t-48.5 -23.5t-27.5 -19.5t-8.5 -12
+q3 -11 27 -26.5t73 -33t114 -32.5t160.5 -25t201.5 -10t201.5 10t160.5 25t114 33t73 33.5t27 27.5q-1 4 -8.5 11t-27.5 19t-48.5 23.5t-76.5 25t-106 23.5q-26 4 -41.5 26t-10.5 48q4 26 26 41.5t48 10.5q71 -12 131 -29.5t118 -44.5t91 -66.5t33 -88.5zM1024 896v-384
+q0 -26 -19 -45t-45 -19h-64v-384q0 -26 -19 -45t-45 -19h-256q-26 0 -45 19t-19 45v384h-64q-26 0 -45 19t-19 45v384q0 53 37.5 90.5t90.5 37.5h384q53 0 90.5 -37.5t37.5 -90.5zM928 1280q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5
+t158.5 -65.5t65.5 -158.5z" />
+    <glyph glyph-name="_509" unicode="&#xf21e;" horiz-adv-x="1792" 
+d="M1280 512h305q-5 -6 -10 -10.5t-9 -7.5l-3 -4l-623 -600q-18 -18 -44 -18t-44 18l-624 602q-5 2 -21 20h369q22 0 39.5 13.5t22.5 34.5l70 281l190 -667q6 -20 23 -33t39 -13q21 0 38 13t23 33l146 485l56 -112q18 -35 57 -35zM1792 940q0 -145 -103 -300h-369l-111 221
+q-8 17 -25.5 27t-36.5 8q-45 -5 -56 -46l-129 -430l-196 686q-6 20 -23.5 33t-39.5 13t-39 -13.5t-22 -34.5l-116 -464h-423q-103 155 -103 300q0 220 127 344t351 124q62 0 126.5 -21.5t120 -58t95.5 -68.5t76 -68q36 36 76 68t95.5 68.5t120 58t126.5 21.5q224 0 351 -124
+t127 -344z" />
+    <glyph glyph-name="venus" unicode="&#xf221;" horiz-adv-x="1280" 
+d="M1152 960q0 -221 -147.5 -384.5t-364.5 -187.5v-260h224q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-224v-224q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v224h-224q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h224v260q-150 16 -271.5 103t-186 224t-52.5 292
+q11 134 80.5 249t182 188t245.5 88q170 19 319 -54t236 -212t87 -306zM128 960q0 -185 131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5z" />
+    <glyph glyph-name="_511" unicode="&#xf222;" 
+d="M1472 1408q26 0 45 -19t19 -45v-416q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v262l-382 -383q126 -156 126 -359q0 -117 -45.5 -223.5t-123 -184t-184 -123t-223.5 -45.5t-223.5 45.5t-184 123t-123 184t-45.5 223.5t45.5 223.5t123 184t184 123t223.5 45.5
+q203 0 359 -126l382 382h-261q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h416zM576 0q185 0 316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_512" unicode="&#xf223;" horiz-adv-x="1280" 
+d="M830 1220q145 -72 233.5 -210.5t88.5 -305.5q0 -221 -147.5 -384.5t-364.5 -187.5v-132h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96v-96q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v96h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96v132q-217 24 -364.5 187.5
+t-147.5 384.5q0 167 88.5 305.5t233.5 210.5q-165 96 -228 273q-6 16 3.5 29.5t26.5 13.5h69q21 0 29 -20q44 -106 140 -171t214 -65t214 65t140 171q8 20 37 20h61q17 0 26.5 -13.5t3.5 -29.5q-63 -177 -228 -273zM576 256q185 0 316.5 131.5t131.5 316.5t-131.5 316.5
+t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_513" unicode="&#xf224;" 
+d="M1024 1504q0 14 9 23t23 9h288q26 0 45 -19t19 -45v-288q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v134l-254 -255q126 -158 126 -359q0 -221 -147.5 -384.5t-364.5 -187.5v-132h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96v-96q0 -14 -9 -23t-23 -9h-64
+q-14 0 -23 9t-9 23v96h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96v132q-149 16 -270.5 103t-186.5 223.5t-53 291.5q16 204 160 353.5t347 172.5q118 14 228 -19t198 -103l255 254h-134q-14 0 -23 9t-9 23v64zM576 256q185 0 316.5 131.5t131.5 316.5t-131.5 316.5
+t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_514" unicode="&#xf225;" horiz-adv-x="1792" 
+d="M1280 1504q0 14 9 23t23 9h288q26 0 45 -19t19 -45v-288q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v134l-254 -255q126 -158 126 -359q0 -221 -147.5 -384.5t-364.5 -187.5v-132h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96v-96q0 -14 -9 -23t-23 -9h-64
+q-14 0 -23 9t-9 23v96h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96v132q-217 24 -364.5 187.5t-147.5 384.5q0 201 126 359l-52 53l-101 -111q-9 -10 -22 -10.5t-23 7.5l-48 44q-10 8 -10.5 21.5t8.5 23.5l105 115l-111 112v-134q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9
+t-9 23v288q0 26 19 45t45 19h288q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-133l106 -107l86 94q9 10 22 10.5t23 -7.5l48 -44q10 -8 10.5 -21.5t-8.5 -23.5l-90 -99l57 -56q158 126 359 126t359 -126l255 254h-134q-14 0 -23 9t-9 23v64zM832 256q185 0 316.5 131.5
+t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_515" unicode="&#xf226;" horiz-adv-x="1792" 
+d="M1790 1007q12 -155 -52.5 -292t-186 -224t-271.5 -103v-260h224q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-224v-224q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v224h-512v-224q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v224h-224q-14 0 -23 9t-9 23v64q0 14 9 23
+t23 9h224v260q-150 16 -271.5 103t-186 224t-52.5 292q17 206 164.5 356.5t352.5 169.5q206 21 377 -94q171 115 377 94q205 -19 352.5 -169.5t164.5 -356.5zM896 647q128 131 128 313t-128 313q-128 -131 -128 -313t128 -313zM576 512q115 0 218 57q-154 165 -154 391
+q0 224 154 391q-103 57 -218 57q-185 0 -316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5zM1152 128v260q-137 15 -256 94q-119 -79 -256 -94v-260h512zM1216 512q185 0 316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5q-115 0 -218 -57q154 -167 154 -391
+q0 -226 -154 -391q103 -57 218 -57z" />
+    <glyph glyph-name="_516" unicode="&#xf227;" horiz-adv-x="1920" 
+d="M1536 1120q0 14 9 23t23 9h288q26 0 45 -19t19 -45v-288q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v134l-254 -255q76 -95 107.5 -214t9.5 -247q-31 -182 -166 -312t-318 -156q-210 -29 -384.5 80t-241.5 300q-117 6 -221 57.5t-177.5 133t-113.5 192.5t-32 230
+q9 135 78 252t182 191.5t248 89.5q118 14 227.5 -19t198.5 -103l255 254h-134q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h288q26 0 45 -19t19 -45v-288q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v134l-254 -255q59 -74 93 -169q182 -9 328 -124l255 254h-134q-14 0 -23 9
+t-9 23v64zM1024 704q0 20 -4 58q-162 -25 -271 -150t-109 -292q0 -20 4 -58q162 25 271 150t109 292zM128 704q0 -168 111 -294t276 -149q-3 29 -3 59q0 210 135 369.5t338 196.5q-53 120 -163.5 193t-245.5 73q-185 0 -316.5 -131.5t-131.5 -316.5zM1088 -128
+q185 0 316.5 131.5t131.5 316.5q0 168 -111 294t-276 149q3 -28 3 -59q0 -210 -135 -369.5t-338 -196.5q53 -120 163.5 -193t245.5 -73z" />
+    <glyph glyph-name="_517" unicode="&#xf228;" horiz-adv-x="2048" 
+d="M1664 1504q0 14 9 23t23 9h288q26 0 45 -19t19 -45v-288q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v134l-254 -255q76 -95 107.5 -214t9.5 -247q-32 -180 -164.5 -310t-313.5 -157q-223 -34 -409 90q-117 -78 -256 -93v-132h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23
+t-23 -9h-96v-96q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v96h-96q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96v132q-155 17 -279.5 109.5t-187 237.5t-39.5 307q25 187 159.5 322.5t320.5 164.5q224 34 410 -90q146 97 320 97q201 0 359 -126l255 254h-134q-14 0 -23 9
+t-9 23v64zM896 391q128 131 128 313t-128 313q-128 -131 -128 -313t128 -313zM128 704q0 -185 131.5 -316.5t316.5 -131.5q117 0 218 57q-154 167 -154 391t154 391q-101 57 -218 57q-185 0 -316.5 -131.5t-131.5 -316.5zM1216 256q185 0 316.5 131.5t131.5 316.5
+t-131.5 316.5t-316.5 131.5q-117 0 -218 -57q154 -167 154 -391t-154 -391q101 -57 218 -57z" />
+    <glyph glyph-name="_518" unicode="&#xf229;" 
+d="M1472 1408q26 0 45 -19t19 -45v-416q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v262l-213 -214l140 -140q9 -10 9 -23t-9 -22l-46 -46q-9 -9 -22 -9t-23 9l-140 141l-78 -79q126 -156 126 -359q0 -117 -45.5 -223.5t-123 -184t-184 -123t-223.5 -45.5t-223.5 45.5
+t-184 123t-123 184t-45.5 223.5t45.5 223.5t123 184t184 123t223.5 45.5q203 0 359 -126l78 78l-172 172q-9 10 -9 23t9 22l46 46q9 9 22 9t23 -9l172 -172l213 213h-261q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h416zM576 0q185 0 316.5 131.5t131.5 316.5t-131.5 316.5
+t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_519" unicode="&#xf22a;" horiz-adv-x="1280" 
+d="M640 892q217 -24 364.5 -187.5t147.5 -384.5q0 -167 -87 -306t-236 -212t-319 -54q-133 15 -245.5 88t-182 188t-80.5 249q-12 155 52.5 292t186 224t271.5 103v132h-160q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h160v165l-92 -92q-10 -9 -23 -9t-22 9l-46 46q-9 9 -9 22
+t9 23l202 201q19 19 45 19t45 -19l202 -201q9 -10 9 -23t-9 -22l-46 -46q-9 -9 -22 -9t-23 9l-92 92v-165h160q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-160v-132zM576 -128q185 0 316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5
+t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_520" unicode="&#xf22b;" horiz-adv-x="2048" 
+d="M1901 621q19 -19 19 -45t-19 -45l-294 -294q-9 -10 -22.5 -10t-22.5 10l-45 45q-10 9 -10 22.5t10 22.5l185 185h-294v-224q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v224h-132q-24 -217 -187.5 -364.5t-384.5 -147.5q-167 0 -306 87t-212 236t-54 319q15 133 88 245.5
+t188 182t249 80.5q155 12 292 -52.5t224 -186t103 -271.5h132v224q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-224h294l-185 185q-10 9 -10 22.5t10 22.5l45 45q9 10 22.5 10t22.5 -10zM576 128q185 0 316.5 131.5t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5
+t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_521" unicode="&#xf22c;" horiz-adv-x="1280" 
+d="M1152 960q0 -221 -147.5 -384.5t-364.5 -187.5v-612q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v612q-217 24 -364.5 187.5t-147.5 384.5q0 117 45.5 223.5t123 184t184 123t223.5 45.5t223.5 -45.5t184 -123t123 -184t45.5 -223.5zM576 512q185 0 316.5 131.5
+t131.5 316.5t-131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5z" />
+    <glyph glyph-name="_522" unicode="&#xf22d;" horiz-adv-x="1280" 
+d="M1024 576q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5t131.5 -316.5t316.5 -131.5t316.5 131.5t131.5 316.5zM1152 576q0 -117 -45.5 -223.5t-123 -184t-184 -123t-223.5 -45.5t-223.5 45.5t-184 123t-123 184t-45.5 223.5t45.5 223.5t123 184t184 123
+t223.5 45.5t223.5 -45.5t184 -123t123 -184t45.5 -223.5z" />
+    <glyph glyph-name="_523" unicode="&#xf22e;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="_524" unicode="&#xf22f;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="_525" unicode="&#xf230;" 
+d="M1451 1408q35 0 60 -25t25 -60v-1366q0 -35 -25 -60t-60 -25h-391v595h199l30 232h-229v148q0 56 23.5 84t91.5 28l122 1v207q-63 9 -178 9q-136 0 -217.5 -80t-81.5 -226v-171h-200v-232h200v-595h-735q-35 0 -60 25t-25 60v1366q0 35 25 60t60 25h1366z" />
+    <glyph glyph-name="_526" unicode="&#xf231;" horiz-adv-x="1280" 
+d="M0 939q0 108 37.5 203.5t103.5 166.5t152 123t185 78t202 26q158 0 294 -66.5t221 -193.5t85 -287q0 -96 -19 -188t-60 -177t-100 -149.5t-145 -103t-189 -38.5q-68 0 -135 32t-96 88q-10 -39 -28 -112.5t-23.5 -95t-20.5 -71t-26 -71t-32 -62.5t-46 -77.5t-62 -86.5
+l-14 -5l-9 10q-15 157 -15 188q0 92 21.5 206.5t66.5 287.5t52 203q-32 65 -32 169q0 83 52 156t132 73q61 0 95 -40.5t34 -102.5q0 -66 -44 -191t-44 -187q0 -63 45 -104.5t109 -41.5q55 0 102 25t78.5 68t56 95t38 110.5t20 111t6.5 99.5q0 173 -109.5 269.5t-285.5 96.5
+q-200 0 -334 -129.5t-134 -328.5q0 -44 12.5 -85t27 -65t27 -45.5t12.5 -30.5q0 -28 -15 -73t-37 -45q-2 0 -17 3q-51 15 -90.5 56t-61 94.5t-32.5 108t-11 106.5z" />
+    <glyph glyph-name="_527" unicode="&#xf232;" 
+d="M985 562q13 0 97.5 -44t89.5 -53q2 -5 2 -15q0 -33 -17 -76q-16 -39 -71 -65.5t-102 -26.5q-57 0 -190 62q-98 45 -170 118t-148 185q-72 107 -71 194v8q3 91 74 158q24 22 52 22q6 0 18 -1.5t19 -1.5q19 0 26.5 -6.5t15.5 -27.5q8 -20 33 -88t25 -75q0 -21 -34.5 -57.5
+t-34.5 -46.5q0 -7 5 -15q34 -73 102 -137q56 -53 151 -101q12 -7 22 -7q15 0 54 48.5t52 48.5zM782 32q127 0 243.5 50t200.5 134t134 200.5t50 243.5t-50 243.5t-134 200.5t-200.5 134t-243.5 50t-243.5 -50t-200.5 -134t-134 -200.5t-50 -243.5q0 -203 120 -368l-79 -233
+l242 77q158 -104 345 -104zM782 1414q153 0 292.5 -60t240.5 -161t161 -240.5t60 -292.5t-60 -292.5t-161 -240.5t-240.5 -161t-292.5 -60q-195 0 -365 94l-417 -134l136 405q-108 178 -108 389q0 153 60 292.5t161 240.5t240.5 161t292.5 60z" />
+    <glyph glyph-name="_528" unicode="&#xf233;" horiz-adv-x="1792" 
+d="M128 128h1024v128h-1024v-128zM128 640h1024v128h-1024v-128zM1696 192q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM128 1152h1024v128h-1024v-128zM1696 704q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM1696 1216
+q0 40 -28 68t-68 28t-68 -28t-28 -68t28 -68t68 -28t68 28t28 68zM1792 384v-384h-1792v384h1792zM1792 896v-384h-1792v384h1792zM1792 1408v-384h-1792v384h1792z" />
+    <glyph glyph-name="_529" unicode="&#xf234;" horiz-adv-x="2048" 
+d="M704 640q-159 0 -271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5t-112.5 -271.5t-271.5 -112.5zM1664 512h352q13 0 22.5 -9.5t9.5 -22.5v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-352v-352q0 -13 -9.5 -22.5t-22.5 -9.5h-192q-13 0 -22.5 9.5
+t-9.5 22.5v352h-352q-13 0 -22.5 9.5t-9.5 22.5v192q0 13 9.5 22.5t22.5 9.5h352v352q0 13 9.5 22.5t22.5 9.5h192q13 0 22.5 -9.5t9.5 -22.5v-352zM928 288q0 -52 38 -90t90 -38h256v-238q-68 -50 -171 -50h-874q-121 0 -194 69t-73 190q0 53 3.5 103.5t14 109t26.5 108.5
+t43 97.5t62 81t85.5 53.5t111.5 20q19 0 39 -17q79 -61 154.5 -91.5t164.5 -30.5t164.5 30.5t154.5 91.5q20 17 39 17q132 0 217 -96h-223q-52 0 -90 -38t-38 -90v-192z" />
+    <glyph glyph-name="_530" unicode="&#xf235;" horiz-adv-x="2048" 
+d="M704 640q-159 0 -271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5t-112.5 -271.5t-271.5 -112.5zM1781 320l249 -249q9 -9 9 -23q0 -13 -9 -22l-136 -136q-9 -9 -22 -9q-14 0 -23 9l-249 249l-249 -249q-9 -9 -23 -9q-13 0 -22 9l-136 136
+q-9 9 -9 22q0 14 9 23l249 249l-249 249q-9 9 -9 23q0 13 9 22l136 136q9 9 22 9q14 0 23 -9l249 -249l249 249q9 9 23 9q13 0 22 -9l136 -136q9 -9 9 -22q0 -14 -9 -23zM1283 320l-181 -181q-37 -37 -37 -91q0 -53 37 -90l83 -83q-21 -3 -44 -3h-874q-121 0 -194 69
+t-73 190q0 53 3.5 103.5t14 109t26.5 108.5t43 97.5t62 81t85.5 53.5t111.5 20q19 0 39 -17q154 -122 319 -122t319 122q20 17 39 17q28 0 57 -6q-28 -27 -41 -50t-13 -56q0 -54 37 -91z" />
+    <glyph glyph-name="_531" unicode="&#xf236;" horiz-adv-x="2048" 
+d="M256 512h1728q26 0 45 -19t19 -45v-448h-256v256h-1536v-256h-256v1216q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-704zM832 832q0 106 -75 181t-181 75t-181 -75t-75 -181t75 -181t181 -75t181 75t75 181zM2048 576v64q0 159 -112.5 271.5t-271.5 112.5h-704
+q-26 0 -45 -19t-19 -45v-384h1152z" />
+    <glyph glyph-name="_532" unicode="&#xf237;" 
+d="M1536 1536l-192 -448h192v-192h-274l-55 -128h329v-192h-411l-357 -832l-357 832h-411v192h329l-55 128h-274v192h192l-192 448h256l323 -768h378l323 768h256zM768 320l108 256h-216z" />
+    <glyph glyph-name="_533" unicode="&#xf238;" 
+d="M1088 1536q185 0 316.5 -93.5t131.5 -226.5v-896q0 -130 -125.5 -222t-305.5 -97l213 -202q16 -15 8 -35t-30 -20h-1056q-22 0 -30 20t8 35l213 202q-180 5 -305.5 97t-125.5 222v896q0 133 131.5 226.5t316.5 93.5h640zM768 192q80 0 136 56t56 136t-56 136t-136 56
+t-136 -56t-56 -136t56 -136t136 -56zM1344 768v512h-1152v-512h1152z" />
+    <glyph glyph-name="_534" unicode="&#xf239;" 
+d="M1088 1536q185 0 316.5 -93.5t131.5 -226.5v-896q0 -130 -125.5 -222t-305.5 -97l213 -202q16 -15 8 -35t-30 -20h-1056q-22 0 -30 20t8 35l213 202q-180 5 -305.5 97t-125.5 222v896q0 133 131.5 226.5t316.5 93.5h640zM288 224q66 0 113 47t47 113t-47 113t-113 47
+t-113 -47t-47 -113t47 -113t113 -47zM704 768v512h-544v-512h544zM1248 224q66 0 113 47t47 113t-47 113t-113 47t-113 -47t-47 -113t47 -113t113 -47zM1408 768v512h-576v-512h576z" />
+    <glyph glyph-name="_535" unicode="&#xf23a;" horiz-adv-x="1792" 
+d="M597 1115v-1173q0 -25 -12.5 -42.5t-36.5 -17.5q-17 0 -33 8l-465 233q-21 10 -35.5 33.5t-14.5 46.5v1140q0 20 10 34t29 14q14 0 44 -15l511 -256q3 -3 3 -5zM661 1014l534 -866l-534 266v600zM1792 996v-1054q0 -25 -14 -40.5t-38 -15.5t-47 13l-441 220zM1789 1116
+q0 -3 -256.5 -419.5t-300.5 -487.5l-390 634l324 527q17 28 52 28q14 0 26 -6l541 -270q4 -2 4 -6z" />
+    <glyph glyph-name="_536" unicode="&#xf23b;" 
+d="M809 532l266 499h-112l-157 -312q-24 -48 -44 -92l-42 92l-155 312h-120l263 -493v-324h101v318zM1536 1408v-1536h-1536v1536h1536z" />
+    <glyph glyph-name="_537" unicode="&#xf23c;" horiz-adv-x="2296" 
+d="M478 -139q-8 -16 -27 -34.5t-37 -25.5q-25 -9 -51.5 3.5t-28.5 31.5q-1 22 40 55t68 38q23 4 34 -21.5t2 -46.5zM1819 -139q7 -16 26 -34.5t38 -25.5q25 -9 51.5 3.5t27.5 31.5q2 22 -39.5 55t-68.5 38q-22 4 -33 -21.5t-2 -46.5zM1867 -30q13 -27 56.5 -59.5t77.5 -41.5
+q45 -13 82 4.5t37 50.5q0 46 -67.5 100.5t-115.5 59.5q-40 5 -63.5 -37.5t-6.5 -76.5zM428 -30q-13 -27 -56 -59.5t-77 -41.5q-45 -13 -82 4.5t-37 50.5q0 46 67.5 100.5t115.5 59.5q40 5 63 -37.5t6 -76.5zM1158 1094h1q-41 0 -76 -15q27 -8 44 -30.5t17 -49.5
+q0 -35 -27 -60t-65 -25q-52 0 -80 43q-5 -23 -5 -42q0 -74 56 -126.5t135 -52.5q80 0 136 52.5t56 126.5t-56 126.5t-136 52.5zM1462 1312q-99 109 -220.5 131.5t-245.5 -44.5q27 60 82.5 96.5t118 39.5t121.5 -17t99.5 -74.5t44.5 -131.5zM2212 73q8 -11 -11 -42
+q7 -23 7 -40q1 -56 -44.5 -112.5t-109.5 -91.5t-118 -37q-48 -2 -92 21.5t-66 65.5q-687 -25 -1259 0q-23 -41 -66.5 -65t-92.5 -22q-86 3 -179.5 80.5t-92.5 160.5q2 22 7 40q-19 31 -11 42q6 10 31 1q14 22 41 51q-7 29 2 38q11 10 39 -4q29 20 59 34q0 29 13 37
+q23 12 51 -16q35 5 61 -2q18 -4 38 -19v73q-11 0 -18 2q-53 10 -97 44.5t-55 87.5q-9 38 0 81q15 62 93 95q2 17 19 35.5t36 23.5t33 -7.5t19 -30.5h13q46 -5 60 -23q3 -3 5 -7q10 1 30.5 3.5t30.5 3.5q-15 11 -30 17q-23 40 -91 43q0 6 1 10q-62 2 -118.5 18.5t-84.5 47.5
+q-32 36 -42.5 92t-2.5 112q16 126 90 179q23 16 52 4.5t32 -40.5q0 -1 1.5 -14t2.5 -21t3 -20t5.5 -19t8.5 -10q27 -14 76 -12q48 46 98 74q-40 4 -162 -14l47 46q61 58 163 111q145 73 282 86q-20 8 -41 15.5t-47 14t-42.5 10.5t-47.5 11t-43 10q595 126 904 -139
+q98 -84 158 -222q85 -10 121 9h1q5 3 8.5 10t5.5 19t3 19.5t3 21.5l1 14q3 28 32 40t52 -5q73 -52 91 -178q7 -57 -3.5 -113t-42.5 -91q-28 -32 -83.5 -48.5t-115.5 -18.5v-10q-71 -2 -95 -43q-14 -5 -31 -17q11 -1 32 -3.5t30 -3.5q1 5 5 8q16 18 60 23h13q5 18 19 30t33 8
+t36 -23t19 -36q79 -32 93 -95q9 -40 1 -81q-12 -53 -56 -88t-97 -44q-10 -2 -17 -2q0 -49 -1 -73q20 15 38 19q26 7 61 2q28 28 51 16q14 -9 14 -37q33 -16 59 -34q27 13 38 4q10 -10 2 -38q28 -30 41 -51q23 8 31 -1zM1937 1025q0 -29 -9 -54q82 -32 112 -132
+q4 37 -9.5 98.5t-41.5 90.5q-20 19 -36 17t-16 -20zM1859 925q35 -42 47.5 -108.5t-0.5 -124.5q67 13 97 45q13 14 18 28q-3 64 -31 114.5t-79 66.5q-15 -15 -52 -21zM1822 921q-30 0 -44 1q42 -115 53 -239q21 0 43 3q16 68 1 135t-53 100zM258 839q30 100 112 132
+q-9 25 -9 54q0 18 -16.5 20t-35.5 -17q-28 -29 -41.5 -90.5t-9.5 -98.5zM294 737q29 -31 97 -45q-13 58 -0.5 124.5t47.5 108.5v0q-37 6 -52 21q-51 -16 -78.5 -66t-31.5 -115q9 -17 18 -28zM471 683q14 124 73 235q-19 -4 -55 -18l-45 -19v1q-46 -89 -20 -196q25 -3 47 -3z
+M1434 644q8 -38 16.5 -108.5t11.5 -89.5q3 -18 9.5 -21.5t23.5 4.5q40 20 62 85.5t23 125.5q-24 2 -146 4zM1152 1285q-116 0 -199 -82.5t-83 -198.5q0 -117 83 -199.5t199 -82.5t199 82.5t83 199.5q0 116 -83 198.5t-199 82.5zM1380 646q-105 2 -211 0v1q-1 -27 2.5 -86
+t13.5 -66q29 -14 93.5 -14.5t95.5 10.5q9 3 11 39t-0.5 69.5t-4.5 46.5zM1112 447q8 4 9.5 48t-0.5 88t-4 63v1q-212 -3 -214 -3q-4 -20 -7 -62t0 -83t14 -46q34 -15 101 -16t101 10zM718 636q-16 -59 4.5 -118.5t77.5 -84.5q15 -8 24 -5t12 21q3 16 8 90t10 103
+q-69 -2 -136 -6zM591 510q3 -23 -34 -36q132 -141 271.5 -240t305.5 -154q172 49 310.5 146t293.5 250q-33 13 -30 34q0 2 0.5 3.5t1.5 3t1 2.5v1v-1q-17 2 -50 5.5t-48 4.5q-26 -90 -82 -132q-51 -38 -82 1q-5 6 -9 14q-7 13 -17 62q-2 -5 -5 -9t-7.5 -7t-8 -5.5t-9.5 -4
+l-10 -2.5t-12 -2l-12 -1.5t-13.5 -1t-13.5 -0.5q-106 -9 -163 11q-4 -17 -10 -26.5t-21 -15t-23 -7t-36 -3.5q-6 -1 -9 -1q-179 -17 -203 40q-2 -63 -56 -54q-47 8 -91 54q-12 13 -20 26q-17 29 -26 65q-58 -6 -87 -10q1 -2 4 -10zM507 -118q3 14 3 30q-17 71 -51 130
+t-73 70q-41 12 -101.5 -14.5t-104.5 -80t-39 -107.5q35 -53 100 -93t119 -42q51 -2 94 28t53 79zM510 53q23 -63 27 -119q195 113 392 174q-98 52 -180.5 120t-179.5 165q-6 -4 -29 -13q0 -1 -1 -4t-1 -5q31 -18 22 -37q-12 -23 -56 -34q-10 -13 -29 -24h-1q-2 -83 1 -150
+q19 -34 35 -73zM579 -113q532 -21 1145 0q-254 147 -428 196q-76 -35 -156 -57q-8 -3 -16 0q-65 21 -129 49q-208 -60 -416 -188h-1v-1q1 0 1 1zM1763 -67q4 54 28 120q14 38 33 71l-1 -1q3 77 3 153q-15 8 -30 25q-42 9 -56 33q-9 20 22 38q-2 4 -2 9q-16 4 -28 12
+q-204 -190 -383 -284q198 -59 414 -176zM2155 -90q5 54 -39 107.5t-104 80t-102 14.5q-38 -11 -72.5 -70.5t-51.5 -129.5q0 -16 3 -30q10 -49 53 -79t94 -28q54 2 119 42t100 93z" />
+    <glyph glyph-name="_538" unicode="&#xf23d;" horiz-adv-x="2304" 
+d="M1524 -25q0 -68 -48 -116t-116 -48t-116.5 48t-48.5 116t48.5 116.5t116.5 48.5t116 -48.5t48 -116.5zM775 -25q0 -68 -48.5 -116t-116.5 -48t-116 48t-48 116t48 116.5t116 48.5t116.5 -48.5t48.5 -116.5zM0 1469q57 -60 110.5 -104.5t121 -82t136 -63t166 -45.5
+t200 -31.5t250 -18.5t304 -9.5t372.5 -2.5q139 0 244.5 -5t181 -16.5t124 -27.5t71 -39.5t24 -51.5t-19.5 -64t-56.5 -76.5t-89.5 -91t-116 -104.5t-139 -119q-185 -157 -286 -247q29 51 76.5 109t94 105.5t94.5 98.5t83 91.5t54 80.5t13 70t-45.5 55.5t-116.5 41t-204 23.5
+t-304 5q-168 -2 -314 6t-256 23t-204.5 41t-159.5 51.5t-122.5 62.5t-91.5 66.5t-68 71.5t-50.5 69.5t-40 68t-36.5 59.5z" />
+    <glyph glyph-name="_539" unicode="&#xf23e;" horiz-adv-x="1792" 
+d="M896 1472q-169 0 -323 -66t-265.5 -177.5t-177.5 -265.5t-66 -323t66 -323t177.5 -265.5t265.5 -177.5t323 -66t323 66t265.5 177.5t177.5 265.5t66 323t-66 323t-177.5 265.5t-265.5 177.5t-323 66zM896 1536q182 0 348 -71t286 -191t191 -286t71 -348t-71 -348
+t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71zM496 704q16 0 16 -16v-480q0 -16 -16 -16h-32q-16 0 -16 16v480q0 16 16 16h32zM896 640q53 0 90.5 -37.5t37.5 -90.5q0 -35 -17.5 -64t-46.5 -46v-114q0 -14 -9 -23
+t-23 -9h-64q-14 0 -23 9t-9 23v114q-29 17 -46.5 46t-17.5 64q0 53 37.5 90.5t90.5 37.5zM896 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM544 928v-96
+q0 -14 9 -23t23 -9h64q14 0 23 9t9 23v96q0 93 65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5v-96q0 -14 9 -23t23 -9h64q14 0 23 9t9 23v96q0 146 -103 249t-249 103t-249 -103t-103 -249zM1408 192v512q0 26 -19 45t-45 19h-896q-26 0 -45 -19t-19 -45v-512
+q0 -26 19 -45t45 -19h896q26 0 45 19t19 45z" />
+    <glyph glyph-name="_540" unicode="&#xf240;" horiz-adv-x="2304" 
+d="M1920 1024v-768h-1664v768h1664zM2048 448h128v384h-128v288q0 14 -9 23t-23 9h-1856q-14 0 -23 -9t-9 -23v-960q0 -14 9 -23t23 -9h1856q14 0 23 9t9 23v288zM2304 832v-384q0 -53 -37.5 -90.5t-90.5 -37.5v-160q0 -66 -47 -113t-113 -47h-1856q-66 0 -113 47t-47 113
+v960q0 66 47 113t113 47h1856q66 0 113 -47t47 -113v-160q53 0 90.5 -37.5t37.5 -90.5z" />
+    <glyph glyph-name="_541" unicode="&#xf241;" horiz-adv-x="2304" 
+d="M256 256v768h1280v-768h-1280zM2176 960q53 0 90.5 -37.5t37.5 -90.5v-384q0 -53 -37.5 -90.5t-90.5 -37.5v-160q0 -66 -47 -113t-113 -47h-1856q-66 0 -113 47t-47 113v960q0 66 47 113t113 47h1856q66 0 113 -47t47 -113v-160zM2176 448v384h-128v288q0 14 -9 23t-23 9
+h-1856q-14 0 -23 -9t-9 -23v-960q0 -14 9 -23t23 -9h1856q14 0 23 9t9 23v288h128z" />
+    <glyph glyph-name="_542" unicode="&#xf242;" horiz-adv-x="2304" 
+d="M256 256v768h896v-768h-896zM2176 960q53 0 90.5 -37.5t37.5 -90.5v-384q0 -53 -37.5 -90.5t-90.5 -37.5v-160q0 -66 -47 -113t-113 -47h-1856q-66 0 -113 47t-47 113v960q0 66 47 113t113 47h1856q66 0 113 -47t47 -113v-160zM2176 448v384h-128v288q0 14 -9 23t-23 9
+h-1856q-14 0 -23 -9t-9 -23v-960q0 -14 9 -23t23 -9h1856q14 0 23 9t9 23v288h128z" />
+    <glyph glyph-name="_543" unicode="&#xf243;" horiz-adv-x="2304" 
+d="M256 256v768h512v-768h-512zM2176 960q53 0 90.5 -37.5t37.5 -90.5v-384q0 -53 -37.5 -90.5t-90.5 -37.5v-160q0 -66 -47 -113t-113 -47h-1856q-66 0 -113 47t-47 113v960q0 66 47 113t113 47h1856q66 0 113 -47t47 -113v-160zM2176 448v384h-128v288q0 14 -9 23t-23 9
+h-1856q-14 0 -23 -9t-9 -23v-960q0 -14 9 -23t23 -9h1856q14 0 23 9t9 23v288h128z" />
+    <glyph glyph-name="_544" unicode="&#xf244;" horiz-adv-x="2304" 
+d="M2176 960q53 0 90.5 -37.5t37.5 -90.5v-384q0 -53 -37.5 -90.5t-90.5 -37.5v-160q0 -66 -47 -113t-113 -47h-1856q-66 0 -113 47t-47 113v960q0 66 47 113t113 47h1856q66 0 113 -47t47 -113v-160zM2176 448v384h-128v288q0 14 -9 23t-23 9h-1856q-14 0 -23 -9t-9 -23
+v-960q0 -14 9 -23t23 -9h1856q14 0 23 9t9 23v288h128z" />
+    <glyph glyph-name="_545" unicode="&#xf245;" horiz-adv-x="1280" 
+d="M1133 493q31 -30 14 -69q-17 -40 -59 -40h-382l201 -476q10 -25 0 -49t-34 -35l-177 -75q-25 -10 -49 0t-35 34l-191 452l-312 -312q-19 -19 -45 -19q-12 0 -24 5q-40 17 -40 59v1504q0 42 40 59q12 5 24 5q27 0 45 -19z" />
+    <glyph glyph-name="_546" unicode="&#xf246;" horiz-adv-x="1024" 
+d="M832 1408q-320 0 -320 -224v-416h128v-128h-128v-544q0 -224 320 -224h64v-128h-64q-272 0 -384 146q-112 -146 -384 -146h-64v128h64q320 0 320 224v544h-128v128h128v416q0 224 -320 224h-64v128h64q272 0 384 -146q112 146 384 146h64v-128h-64z" />
+    <glyph glyph-name="_547" unicode="&#xf247;" horiz-adv-x="2048" 
+d="M2048 1152h-128v-1024h128v-384h-384v128h-1280v-128h-384v384h128v1024h-128v384h384v-128h1280v128h384v-384zM1792 1408v-128h128v128h-128zM128 1408v-128h128v128h-128zM256 -128v128h-128v-128h128zM1664 0v128h128v1024h-128v128h-1280v-128h-128v-1024h128v-128
+h1280zM1920 -128v128h-128v-128h128zM1280 896h384v-768h-896v256h-384v768h896v-256zM512 512h640v512h-640v-512zM1536 256v512h-256v-384h-384v-128h640z" />
+    <glyph glyph-name="_548" unicode="&#xf248;" horiz-adv-x="2304" 
+d="M2304 768h-128v-640h128v-384h-384v128h-896v-128h-384v384h128v128h-384v-128h-384v384h128v640h-128v384h384v-128h896v128h384v-384h-128v-128h384v128h384v-384zM2048 1024v-128h128v128h-128zM1408 1408v-128h128v128h-128zM128 1408v-128h128v128h-128zM256 256
+v128h-128v-128h128zM1536 384h-128v-128h128v128zM384 384h896v128h128v640h-128v128h-896v-128h-128v-640h128v-128zM896 -128v128h-128v-128h128zM2176 -128v128h-128v-128h128zM2048 128v640h-128v128h-384v-384h128v-384h-384v128h-384v-128h128v-128h896v128h128z" />
+    <glyph glyph-name="_549" unicode="&#xf249;" 
+d="M1024 288v-416h-928q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h1344q40 0 68 -28t28 -68v-928h-416q-40 0 -68 -28t-28 -68zM1152 256h381q-15 -82 -65 -132l-184 -184q-50 -50 -132 -65v381z" />
+    <glyph glyph-name="_550" unicode="&#xf24a;" 
+d="M1400 256h-248v-248q29 10 41 22l185 185q12 12 22 41zM1120 384h288v896h-1280v-1280h896v288q0 40 28 68t68 28zM1536 1312v-1024q0 -40 -20 -88t-48 -76l-184 -184q-28 -28 -76 -48t-88 -20h-1024q-40 0 -68 28t-28 68v1344q0 40 28 68t68 28h1344q40 0 68 -28t28 -68
+z" />
+    <glyph glyph-name="_551" unicode="&#xf24b;" horiz-adv-x="2304" 
+d="M1951 538q0 -26 -15.5 -44.5t-38.5 -23.5q-8 -2 -18 -2h-153v140h153q10 0 18 -2q23 -5 38.5 -23.5t15.5 -44.5zM1933 751q0 -25 -15 -42t-38 -21q-3 -1 -15 -1h-139v129h139q3 0 8.5 -0.5t6.5 -0.5q23 -4 38 -21.5t15 -42.5zM728 587v308h-228v-308q0 -58 -38 -94.5
+t-105 -36.5q-108 0 -229 59v-112q53 -15 121 -23t109 -9l42 -1q328 0 328 217zM1442 403v113q-99 -52 -200 -59q-108 -8 -169 41t-61 142t61 142t169 41q101 -7 200 -58v112q-48 12 -100 19.5t-80 9.5l-28 2q-127 6 -218.5 -14t-140.5 -60t-71 -88t-22 -106t22 -106t71 -88
+t140.5 -60t218.5 -14q101 4 208 31zM2176 518q0 54 -43 88.5t-109 39.5v3q57 8 89 41.5t32 79.5q0 55 -41 88t-107 36q-3 0 -12 0.5t-14 0.5h-455v-510h491q74 0 121.5 36.5t47.5 96.5zM2304 1280v-1280q0 -52 -38 -90t-90 -38h-2048q-52 0 -90 38t-38 90v1280q0 52 38 90
+t90 38h2048q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_552" unicode="&#xf24c;" horiz-adv-x="2304" 
+d="M858 295v693q-106 -41 -172 -135.5t-66 -211.5t66 -211.5t172 -134.5zM1362 641q0 117 -66 211.5t-172 135.5v-694q106 41 172 135.5t66 211.5zM1577 641q0 -159 -78.5 -294t-213.5 -213.5t-294 -78.5q-119 0 -227.5 46.5t-187 125t-125 187t-46.5 227.5q0 159 78.5 294
+t213.5 213.5t294 78.5t294 -78.5t213.5 -213.5t78.5 -294zM1960 634q0 139 -55.5 261.5t-147.5 205.5t-213.5 131t-252.5 48h-301q-176 0 -323.5 -81t-235 -230t-87.5 -335q0 -171 87 -317.5t236 -231.5t323 -85h301q129 0 251.5 50.5t214.5 135t147.5 202.5t55.5 246z
+M2304 1280v-1280q0 -52 -38 -90t-90 -38h-2048q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h2048q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_553" unicode="&#xf24d;" horiz-adv-x="1792" 
+d="M1664 -96v1088q0 13 -9.5 22.5t-22.5 9.5h-1088q-13 0 -22.5 -9.5t-9.5 -22.5v-1088q0 -13 9.5 -22.5t22.5 -9.5h1088q13 0 22.5 9.5t9.5 22.5zM1792 992v-1088q0 -66 -47 -113t-113 -47h-1088q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1088q66 0 113 -47t47 -113
+zM1408 1376v-160h-128v160q0 13 -9.5 22.5t-22.5 9.5h-1088q-13 0 -22.5 -9.5t-9.5 -22.5v-1088q0 -13 9.5 -22.5t22.5 -9.5h160v-128h-160q-66 0 -113 47t-47 113v1088q0 66 47 113t113 47h1088q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="_554" unicode="&#xf24e;" horiz-adv-x="2304" 
+d="M1728 1088l-384 -704h768zM448 1088l-384 -704h768zM1269 1280q-14 -40 -45.5 -71.5t-71.5 -45.5v-1291h608q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1344q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h608v1291q-40 14 -71.5 45.5t-45.5 71.5h-491q-14 0 -23 9t-9 23v64
+q0 14 9 23t23 9h491q21 57 70 92.5t111 35.5t111 -35.5t70 -92.5h491q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-491zM1088 1264q33 0 56.5 23.5t23.5 56.5t-23.5 56.5t-56.5 23.5t-56.5 -23.5t-23.5 -56.5t23.5 -56.5t56.5 -23.5zM2176 384q0 -73 -46.5 -131t-117.5 -91
+t-144.5 -49.5t-139.5 -16.5t-139.5 16.5t-144.5 49.5t-117.5 91t-46.5 131q0 11 35 81t92 174.5t107 195.5t102 184t56 100q18 33 56 33t56 -33q4 -7 56 -100t102 -184t107 -195.5t92 -174.5t35 -81zM896 384q0 -73 -46.5 -131t-117.5 -91t-144.5 -49.5t-139.5 -16.5
+t-139.5 16.5t-144.5 49.5t-117.5 91t-46.5 131q0 11 35 81t92 174.5t107 195.5t102 184t56 100q18 33 56 33t56 -33q4 -7 56 -100t102 -184t107 -195.5t92 -174.5t35 -81z" />
+    <glyph glyph-name="_555" unicode="&#xf250;" 
+d="M1408 1408q0 -261 -106.5 -461.5t-266.5 -306.5q160 -106 266.5 -306.5t106.5 -461.5h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1472q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96q0 261 106.5 461.5t266.5 306.5q-160 106 -266.5 306.5t-106.5 461.5h-96q-14 0 -23 9
+t-9 23v64q0 14 9 23t23 9h1472q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96zM874 700q77 29 149 92.5t129.5 152.5t92.5 210t35 253h-1024q0 -132 35 -253t92.5 -210t129.5 -152.5t149 -92.5q19 -7 30.5 -23.5t11.5 -36.5t-11.5 -36.5t-30.5 -23.5q-77 -29 -149 -92.5
+t-129.5 -152.5t-92.5 -210t-35 -253h1024q0 132 -35 253t-92.5 210t-129.5 152.5t-149 92.5q-19 7 -30.5 23.5t-11.5 36.5t11.5 36.5t30.5 23.5z" />
+    <glyph glyph-name="_556" unicode="&#xf251;" 
+d="M1408 1408q0 -261 -106.5 -461.5t-266.5 -306.5q160 -106 266.5 -306.5t106.5 -461.5h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1472q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96q0 261 106.5 461.5t266.5 306.5q-160 106 -266.5 306.5t-106.5 461.5h-96q-14 0 -23 9
+t-9 23v64q0 14 9 23t23 9h1472q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96zM1280 1408h-1024q0 -66 9 -128h1006q9 61 9 128zM1280 -128q0 130 -34 249.5t-90.5 208t-126.5 152t-146 94.5h-230q-76 -31 -146 -94.5t-126.5 -152t-90.5 -208t-34 -249.5h1024z" />
+    <glyph glyph-name="_557" unicode="&#xf252;" 
+d="M1408 1408q0 -261 -106.5 -461.5t-266.5 -306.5q160 -106 266.5 -306.5t106.5 -461.5h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1472q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96q0 261 106.5 461.5t266.5 306.5q-160 106 -266.5 306.5t-106.5 461.5h-96q-14 0 -23 9
+t-9 23v64q0 14 9 23t23 9h1472q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96zM1280 1408h-1024q0 -206 85 -384h854q85 178 85 384zM1223 192q-54 141 -145.5 241.5t-194.5 142.5h-230q-103 -42 -194.5 -142.5t-145.5 -241.5h910z" />
+    <glyph glyph-name="_558" unicode="&#xf253;" 
+d="M1408 1408q0 -261 -106.5 -461.5t-266.5 -306.5q160 -106 266.5 -306.5t106.5 -461.5h96q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-1472q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96q0 261 106.5 461.5t266.5 306.5q-160 106 -266.5 306.5t-106.5 461.5h-96q-14 0 -23 9
+t-9 23v64q0 14 9 23t23 9h1472q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-96zM874 700q77 29 149 92.5t129.5 152.5t92.5 210t35 253h-1024q0 -132 35 -253t92.5 -210t129.5 -152.5t149 -92.5q19 -7 30.5 -23.5t11.5 -36.5t-11.5 -36.5t-30.5 -23.5q-137 -51 -244 -196
+h700q-107 145 -244 196q-19 7 -30.5 23.5t-11.5 36.5t11.5 36.5t30.5 23.5z" />
+    <glyph glyph-name="_559" unicode="&#xf254;" 
+d="M1504 -64q14 0 23 -9t9 -23v-128q0 -14 -9 -23t-23 -9h-1472q-14 0 -23 9t-9 23v128q0 14 9 23t23 9h1472zM130 0q3 55 16 107t30 95t46 87t53.5 76t64.5 69.5t66 60t70.5 55t66.5 47.5t65 43q-43 28 -65 43t-66.5 47.5t-70.5 55t-66 60t-64.5 69.5t-53.5 76t-46 87
+t-30 95t-16 107h1276q-3 -55 -16 -107t-30 -95t-46 -87t-53.5 -76t-64.5 -69.5t-66 -60t-70.5 -55t-66.5 -47.5t-65 -43q43 -28 65 -43t66.5 -47.5t70.5 -55t66 -60t64.5 -69.5t53.5 -76t46 -87t30 -95t16 -107h-1276zM1504 1536q14 0 23 -9t9 -23v-128q0 -14 -9 -23t-23 -9
+h-1472q-14 0 -23 9t-9 23v128q0 14 9 23t23 9h1472z" />
+    <glyph glyph-name="_560" unicode="&#xf255;" 
+d="M768 1152q-53 0 -90.5 -37.5t-37.5 -90.5v-128h-32v93q0 48 -32 81.5t-80 33.5q-46 0 -79 -33t-33 -79v-429l-32 30v172q0 48 -32 81.5t-80 33.5q-46 0 -79 -33t-33 -79v-224q0 -47 35 -82l310 -296q39 -39 39 -102q0 -26 19 -45t45 -19h640q26 0 45 19t19 45v25
+q0 41 10 77l108 436q10 36 10 77v246q0 48 -32 81.5t-80 33.5q-46 0 -79 -33t-33 -79v-32h-32v125q0 40 -25 72.5t-64 40.5q-14 2 -23 2q-46 0 -79 -33t-33 -79v-128h-32v122q0 51 -32.5 89.5t-82.5 43.5q-5 1 -13 1zM768 1280q84 0 149 -50q57 34 123 34q59 0 111 -27
+t86 -76q27 7 59 7q100 0 170 -71.5t70 -171.5v-246q0 -51 -13 -108l-109 -436q-6 -24 -6 -71q0 -80 -56 -136t-136 -56h-640q-84 0 -138 58.5t-54 142.5l-308 296q-76 73 -76 175v224q0 99 70.5 169.5t169.5 70.5q11 0 16 -1q6 95 75.5 160t164.5 65q52 0 98 -21
+q72 69 174 69z" />
+    <glyph glyph-name="_561" unicode="&#xf256;" horiz-adv-x="1792" 
+d="M880 1408q-46 0 -79 -33t-33 -79v-656h-32v528q0 46 -33 79t-79 33t-79 -33t-33 -79v-528v-256l-154 205q-38 51 -102 51q-53 0 -90.5 -37.5t-37.5 -90.5q0 -43 26 -77l384 -512q38 -51 102 -51h688q34 0 61 22t34 56l76 405q5 32 5 59v498q0 46 -33 79t-79 33t-79 -33
+t-33 -79v-272h-32v528q0 46 -33 79t-79 33t-79 -33t-33 -79v-528h-32v656q0 46 -33 79t-79 33zM880 1536q68 0 125.5 -35.5t88.5 -96.5q19 4 42 4q99 0 169.5 -70.5t70.5 -169.5v-17q105 6 180.5 -64t75.5 -175v-498q0 -40 -8 -83l-76 -404q-14 -79 -76.5 -131t-143.5 -52
+h-688q-60 0 -114.5 27.5t-90.5 74.5l-384 512q-51 68 -51 154q0 106 75 181t181 75q78 0 128 -34v434q0 99 70.5 169.5t169.5 70.5q23 0 42 -4q31 61 88.5 96.5t125.5 35.5z" />
+    <glyph glyph-name="_562" unicode="&#xf257;" horiz-adv-x="1792" 
+d="M1073 -128h-177q-163 0 -226 141q-23 49 -23 102v5q-62 30 -98.5 88.5t-36.5 127.5q0 38 5 48h-261q-106 0 -181 75t-75 181t75 181t181 75h113l-44 17q-74 28 -119.5 93.5t-45.5 145.5q0 106 75 181t181 75q46 0 91 -17l628 -239h401q106 0 181 -75t75 -181v-668
+q0 -88 -54 -157.5t-140 -90.5l-339 -85q-92 -23 -186 -23zM1024 583l-155 -71l-163 -74q-30 -14 -48 -41.5t-18 -60.5q0 -46 33 -79t79 -33q26 0 46 10l338 154q-49 10 -80.5 50t-31.5 90v55zM1344 272q0 46 -33 79t-79 33q-26 0 -46 -10l-290 -132q-28 -13 -37 -17
+t-30.5 -17t-29.5 -23.5t-16 -29t-8 -40.5q0 -50 31.5 -82t81.5 -32q20 0 38 9l352 160q30 14 48 41.5t18 60.5zM1112 1024l-650 248q-24 8 -46 8q-53 0 -90.5 -37.5t-37.5 -90.5q0 -40 22.5 -73t59.5 -47l526 -200v-64h-640q-53 0 -90.5 -37.5t-37.5 -90.5t37.5 -90.5
+t90.5 -37.5h535l233 106v198q0 63 46 106l111 102h-69zM1073 0q82 0 155 19l339 85q43 11 70 45.5t27 78.5v668q0 53 -37.5 90.5t-90.5 37.5h-308l-136 -126q-36 -33 -36 -82v-296q0 -46 33 -77t79 -31t79 35t33 81v208h32v-208q0 -70 -57 -114q52 -8 86.5 -48.5t34.5 -93.5
+q0 -42 -23 -78t-61 -53l-310 -141h91z" />
+    <glyph glyph-name="_563" unicode="&#xf258;" horiz-adv-x="2048" 
+d="M1151 1536q61 0 116 -28t91 -77l572 -781q118 -159 118 -359v-355q0 -80 -56 -136t-136 -56h-384q-80 0 -136 56t-56 136v177l-286 143h-546q-80 0 -136 56t-56 136v32q0 119 84.5 203.5t203.5 84.5h420l42 128h-686q-100 0 -173.5 67.5t-81.5 166.5q-65 79 -65 182v32
+q0 80 56 136t136 56h959zM1920 -64v355q0 157 -93 284l-573 781q-39 52 -103 52h-959q-26 0 -45 -19t-19 -45q0 -32 1.5 -49.5t9.5 -40.5t25 -43q10 31 35.5 50t56.5 19h832v-32h-832q-26 0 -45 -19t-19 -45q0 -44 3 -58q8 -44 44 -73t81 -29h640h91q40 0 68 -28t28 -68
+q0 -15 -5 -30l-64 -192q-10 -29 -35 -47.5t-56 -18.5h-443q-66 0 -113 -47t-47 -113v-32q0 -26 19 -45t45 -19h561q16 0 29 -7l317 -158q24 -13 38.5 -36t14.5 -50v-197q0 -26 19 -45t45 -19h384q26 0 45 19t19 45z" />
+    <glyph glyph-name="_564" unicode="&#xf259;" horiz-adv-x="2048" 
+d="M459 -256q-77 0 -137.5 47.5t-79.5 122.5l-101 401q-13 57 -13 108q0 45 -5 67l-116 477q-7 27 -7 57q0 93 62 161t155 78q17 85 82.5 139t152.5 54q83 0 148 -51.5t85 -132.5l83 -348l103 428q20 81 85 132.5t148 51.5q89 0 155.5 -57.5t80.5 -144.5q92 -10 152 -79
+t60 -162q0 -24 -7 -59l-123 -512q10 7 37.5 28.5t38.5 29.5t35 23t41 20.5t41.5 11t49.5 5.5q105 0 180 -74t75 -179q0 -62 -28.5 -118t-78.5 -94l-507 -380q-68 -51 -153 -51h-694zM1104 1408q-38 0 -68.5 -24t-39.5 -62l-164 -682h-127l-145 602q-9 38 -39.5 62t-68.5 24
+q-48 0 -80 -33t-32 -80q0 -15 3 -28l132 -547h-26l-99 408q-9 37 -40 62.5t-69 25.5q-47 0 -80 -33t-33 -79q0 -14 3 -26l116 -478q7 -28 9 -86t10 -88l100 -401q8 -32 34 -52.5t59 -20.5h694q42 0 76 26l507 379q56 43 56 110q0 52 -37.5 88.5t-89.5 36.5q-43 0 -77 -26
+l-307 -230v227q0 4 32 138t68 282t39 161q4 18 4 29q0 47 -32 81t-79 34q-39 0 -69.5 -24t-39.5 -62l-116 -482h-26l150 624q3 14 3 28q0 48 -31.5 82t-79.5 34z" />
+    <glyph glyph-name="_565" unicode="&#xf25a;" horiz-adv-x="1792" 
+d="M640 1408q-53 0 -90.5 -37.5t-37.5 -90.5v-512v-384l-151 202q-41 54 -107 54q-52 0 -89 -38t-37 -90q0 -43 26 -77l384 -512q38 -51 102 -51h718q22 0 39.5 13.5t22.5 34.5l92 368q24 96 24 194v217q0 41 -28 71t-68 30t-68 -28t-28 -68h-32v61q0 48 -32 81.5t-80 33.5
+q-46 0 -79 -33t-33 -79v-64h-32v90q0 55 -37 94.5t-91 39.5q-53 0 -90.5 -37.5t-37.5 -90.5v-96h-32v570q0 55 -37 94.5t-91 39.5zM640 1536q107 0 181.5 -77.5t74.5 -184.5v-220q22 2 32 2q99 0 173 -69q47 21 99 21q113 0 184 -87q27 7 56 7q94 0 159 -67.5t65 -161.5
+v-217q0 -116 -28 -225l-92 -368q-16 -64 -68 -104.5t-118 -40.5h-718q-60 0 -114.5 27.5t-90.5 74.5l-384 512q-51 68 -51 154q0 105 74.5 180.5t179.5 75.5q71 0 130 -35v547q0 106 75 181t181 75zM768 128v384h-32v-384h32zM1024 128v384h-32v-384h32zM1280 128v384h-32
+v-384h32z" />
+    <glyph glyph-name="_566" unicode="&#xf25b;" 
+d="M1288 889q60 0 107 -23q141 -63 141 -226v-177q0 -94 -23 -186l-85 -339q-21 -86 -90.5 -140t-157.5 -54h-668q-106 0 -181 75t-75 181v401l-239 628q-17 45 -17 91q0 106 75 181t181 75q80 0 145.5 -45.5t93.5 -119.5l17 -44v113q0 106 75 181t181 75t181 -75t75 -181
+v-261q27 5 48 5q69 0 127.5 -36.5t88.5 -98.5zM1072 896q-33 0 -60.5 -18t-41.5 -48l-74 -163l-71 -155h55q50 0 90 -31.5t50 -80.5l154 338q10 20 10 46q0 46 -33 79t-79 33zM1293 761q-22 0 -40.5 -8t-29 -16t-23.5 -29.5t-17 -30.5t-17 -37l-132 -290q-10 -20 -10 -46
+q0 -46 33 -79t79 -33q33 0 60.5 18t41.5 48l160 352q9 18 9 38q0 50 -32 81.5t-82 31.5zM128 1120q0 -22 8 -46l248 -650v-69l102 111q43 46 106 46h198l106 233v535q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5v-640h-64l-200 526q-14 37 -47 59.5t-73 22.5
+q-53 0 -90.5 -37.5t-37.5 -90.5zM1180 -128q44 0 78.5 27t45.5 70l85 339q19 73 19 155v91l-141 -310q-17 -38 -53 -61t-78 -23q-53 0 -93.5 34.5t-48.5 86.5q-44 -57 -114 -57h-208v32h208q46 0 81 33t35 79t-31 79t-77 33h-296q-49 0 -82 -36l-126 -136v-308
+q0 -53 37.5 -90.5t90.5 -37.5h668z" />
+    <glyph glyph-name="_567" unicode="&#xf25c;" horiz-adv-x="1973" 
+d="M857 992v-117q0 -13 -9.5 -22t-22.5 -9h-298v-812q0 -13 -9 -22.5t-22 -9.5h-135q-13 0 -22.5 9t-9.5 23v812h-297q-13 0 -22.5 9t-9.5 22v117q0 14 9 23t23 9h793q13 0 22.5 -9.5t9.5 -22.5zM1895 995l77 -961q1 -13 -8 -24q-10 -10 -23 -10h-134q-12 0 -21 8.5
+t-10 20.5l-46 588l-189 -425q-8 -19 -29 -19h-120q-20 0 -29 19l-188 427l-45 -590q-1 -12 -10 -20.5t-21 -8.5h-135q-13 0 -23 10q-9 10 -9 24l78 961q1 12 10 20.5t21 8.5h142q20 0 29 -19l220 -520q10 -24 20 -51q3 7 9.5 24.5t10.5 26.5l221 520q9 19 29 19h141
+q13 0 22 -8.5t10 -20.5z" />
+    <glyph glyph-name="_568" unicode="&#xf25d;" horiz-adv-x="1792" 
+d="M1042 833q0 88 -60 121q-33 18 -117 18h-123v-281h162q66 0 102 37t36 105zM1094 548l205 -373q8 -17 -1 -31q-8 -16 -27 -16h-152q-20 0 -28 17l-194 365h-155v-350q0 -14 -9 -23t-23 -9h-134q-14 0 -23 9t-9 23v960q0 14 9 23t23 9h294q128 0 190 -24q85 -31 134 -109
+t49 -180q0 -92 -42.5 -165.5t-115.5 -109.5q6 -10 9 -16zM896 1376q-150 0 -286 -58.5t-234.5 -157t-157 -234.5t-58.5 -286t58.5 -286t157 -234.5t234.5 -157t286 -58.5t286 58.5t234.5 157t157 234.5t58.5 286t-58.5 286t-157 234.5t-234.5 157t-286 58.5zM1792 640
+q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="_569" unicode="&#xf25e;" horiz-adv-x="1792" 
+d="M605 303q153 0 257 104q14 18 3 36l-45 82q-6 13 -24 17q-16 2 -27 -11l-4 -3q-4 -4 -11.5 -10t-17.5 -13.5t-23.5 -14.5t-28.5 -13t-33.5 -9.5t-37.5 -3.5q-76 0 -125 50t-49 127q0 76 48 125.5t122 49.5q37 0 71.5 -14t50.5 -28l16 -14q11 -11 26 -10q16 2 24 14l53 78
+q13 20 -2 39q-3 4 -11 12t-30 23.5t-48.5 28t-67.5 22.5t-86 10q-148 0 -246 -96.5t-98 -240.5q0 -146 97 -241.5t247 -95.5zM1235 303q153 0 257 104q14 18 4 36l-45 82q-8 14 -25 17q-16 2 -27 -11l-4 -3q-4 -4 -11.5 -10t-17.5 -13.5t-23.5 -14.5t-28.5 -13t-33.5 -9.5
+t-37.5 -3.5q-76 0 -125 50t-49 127q0 76 48 125.5t122 49.5q37 0 71.5 -14t50.5 -28l16 -14q11 -11 26 -10q16 2 24 14l53 78q13 20 -2 39q-3 4 -11 12t-30 23.5t-48.5 28t-67.5 22.5t-86 10q-147 0 -245.5 -96.5t-98.5 -240.5q0 -146 97 -241.5t247 -95.5zM896 1376
+q-150 0 -286 -58.5t-234.5 -157t-157 -234.5t-58.5 -286t58.5 -286t157 -234.5t234.5 -157t286 -58.5t286 58.5t234.5 157t157 234.5t58.5 286t-58.5 286t-157 234.5t-234.5 157t-286 58.5zM896 1536q182 0 348 -71t286 -191t191 -286t71 -348t-71 -348t-191 -286t-286 -191
+t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71z" />
+    <glyph glyph-name="f260" unicode="&#xf260;" horiz-adv-x="2048" 
+d="M736 736l384 -384l-384 -384l-672 672l672 672l168 -168l-96 -96l-72 72l-480 -480l480 -480l193 193l-289 287zM1312 1312l672 -672l-672 -672l-168 168l96 96l72 -72l480 480l-480 480l-193 -193l289 -287l-96 -96l-384 384z" />
+    <glyph glyph-name="f261" unicode="&#xf261;" horiz-adv-x="1792" 
+d="M717 182l271 271l-279 279l-88 -88l192 -191l-96 -96l-279 279l279 279l40 -40l87 87l-127 128l-454 -454zM1075 190l454 454l-454 454l-271 -271l279 -279l88 88l-192 191l96 96l279 -279l-279 -279l-40 40l-87 -88zM1792 640q0 -182 -71 -348t-191 -286t-286 -191
+t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="_572" unicode="&#xf262;" horiz-adv-x="2304" 
+d="M651 539q0 -39 -27.5 -66.5t-65.5 -27.5q-39 0 -66.5 27.5t-27.5 66.5q0 38 27.5 65.5t66.5 27.5q38 0 65.5 -27.5t27.5 -65.5zM1805 540q0 -39 -27.5 -66.5t-66.5 -27.5t-66.5 27.5t-27.5 66.5t27.5 66t66.5 27t66.5 -27t27.5 -66zM765 539q0 79 -56.5 136t-136.5 57
+t-136.5 -56.5t-56.5 -136.5t56.5 -136.5t136.5 -56.5t136.5 56.5t56.5 136.5zM1918 540q0 80 -56.5 136.5t-136.5 56.5q-79 0 -136 -56.5t-57 -136.5t56.5 -136.5t136.5 -56.5t136.5 56.5t56.5 136.5zM850 539q0 -116 -81.5 -197.5t-196.5 -81.5q-116 0 -197.5 82t-81.5 197
+t82 196.5t197 81.5t196.5 -81.5t81.5 -196.5zM2004 540q0 -115 -81.5 -196.5t-197.5 -81.5q-115 0 -196.5 81.5t-81.5 196.5t81.5 196.5t196.5 81.5q116 0 197.5 -81.5t81.5 -196.5zM1040 537q0 191 -135.5 326.5t-326.5 135.5q-125 0 -231 -62t-168 -168.5t-62 -231.5
+t62 -231.5t168 -168.5t231 -62q191 0 326.5 135.5t135.5 326.5zM1708 1110q-254 111 -556 111q-319 0 -573 -110q117 0 223 -45.5t182.5 -122.5t122 -183t45.5 -223q0 115 43.5 219.5t118 180.5t177.5 123t217 50zM2187 537q0 191 -135 326.5t-326 135.5t-326.5 -135.5
+t-135.5 -326.5t135.5 -326.5t326.5 -135.5t326 135.5t135 326.5zM1921 1103h383q-44 -51 -75 -114.5t-40 -114.5q110 -151 110 -337q0 -156 -77 -288t-209 -208.5t-287 -76.5q-133 0 -249 56t-196 155q-47 -56 -129 -179q-11 22 -53.5 82.5t-74.5 97.5
+q-80 -99 -196.5 -155.5t-249.5 -56.5q-155 0 -287 76.5t-209 208.5t-77 288q0 186 110 337q-9 51 -40 114.5t-75 114.5h365q149 100 355 156.5t432 56.5q224 0 421 -56t348 -157z" />
+    <glyph glyph-name="f263" unicode="&#xf263;" horiz-adv-x="1280" 
+d="M640 629q-188 0 -321 133t-133 320q0 188 133 321t321 133t321 -133t133 -321q0 -187 -133 -320t-321 -133zM640 1306q-92 0 -157.5 -65.5t-65.5 -158.5q0 -92 65.5 -157.5t157.5 -65.5t157.5 65.5t65.5 157.5q0 93 -65.5 158.5t-157.5 65.5zM1163 574q13 -27 15 -49.5
+t-4.5 -40.5t-26.5 -38.5t-42.5 -37t-61.5 -41.5q-115 -73 -315 -94l73 -72l267 -267q30 -31 30 -74t-30 -73l-12 -13q-31 -30 -74 -30t-74 30q-67 68 -267 268l-267 -268q-31 -30 -74 -30t-73 30l-12 13q-31 30 -31 73t31 74l267 267l72 72q-203 21 -317 94
+q-39 25 -61.5 41.5t-42.5 37t-26.5 38.5t-4.5 40.5t15 49.5q10 20 28 35t42 22t56 -2t65 -35q5 -4 15 -11t43 -24.5t69 -30.5t92 -24t113 -11q91 0 174 25.5t120 50.5l38 25q33 26 65 35t56 2t42 -22t28 -35z" />
+    <glyph glyph-name="_574" unicode="&#xf264;" 
+d="M927 956q0 -66 -46.5 -112.5t-112.5 -46.5t-112.5 46.5t-46.5 112.5t46.5 112.5t112.5 46.5t112.5 -46.5t46.5 -112.5zM1141 593q-10 20 -28 32t-47.5 9.5t-60.5 -27.5q-10 -8 -29 -20t-81 -32t-127 -20t-124 18t-86 36l-27 18q-31 25 -60.5 27.5t-47.5 -9.5t-28 -32
+q-22 -45 -2 -74.5t87 -73.5q83 -53 226 -67l-51 -52q-142 -142 -191 -190q-22 -22 -22 -52.5t22 -52.5l9 -9q22 -22 52.5 -22t52.5 22l191 191q114 -115 191 -191q22 -22 52.5 -22t52.5 22l9 9q22 22 22 52.5t-22 52.5l-191 190l-52 52q141 14 225 67q67 44 87 73.5t-2 74.5
+zM1092 956q0 134 -95 229t-229 95t-229 -95t-95 -229t95 -229t229 -95t229 95t95 229zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="_575" unicode="&#xf265;" horiz-adv-x="1720" 
+d="M1565 1408q65 0 110 -45.5t45 -110.5v-519q0 -176 -68 -336t-182.5 -275t-274 -182.5t-334.5 -67.5q-176 0 -335.5 67.5t-274.5 182.5t-183 275t-68 336v519q0 64 46 110t110 46h1409zM861 344q47 0 82 33l404 388q37 35 37 85q0 49 -34.5 83.5t-83.5 34.5q-47 0 -82 -33
+l-323 -310l-323 310q-35 33 -81 33q-49 0 -83.5 -34.5t-34.5 -83.5q0 -51 36 -85l405 -388q33 -33 81 -33z" />
+    <glyph glyph-name="_576" unicode="&#xf266;" horiz-adv-x="2304" 
+d="M1494 -103l-295 695q-25 -49 -158.5 -305.5t-198.5 -389.5q-1 -1 -27.5 -0.5t-26.5 1.5q-82 193 -255.5 587t-259.5 596q-21 50 -66.5 107.5t-103.5 100.5t-102 43q0 5 -0.5 24t-0.5 27h583v-50q-39 -2 -79.5 -16t-66.5 -43t-10 -64q26 -59 216.5 -499t235.5 -540
+q31 61 140 266.5t131 247.5q-19 39 -126 281t-136 295q-38 69 -201 71v50l513 -1v-47q-60 -2 -93.5 -25t-12.5 -69q33 -70 87 -189.5t86 -187.5q110 214 173 363q24 55 -10 79.5t-129 26.5q1 7 1 25v24q64 0 170.5 0.5t180 1t92.5 0.5v-49q-62 -2 -119 -33t-90 -81
+l-213 -442q13 -33 127.5 -290t121.5 -274l441 1017q-14 38 -49.5 62.5t-65 31.5t-55.5 8v50l460 -4l1 -2l-1 -44q-139 -4 -201 -145q-526 -1216 -559 -1291h-49z" />
+    <glyph glyph-name="_577" unicode="&#xf267;" horiz-adv-x="1792" 
+d="M949 643q0 -26 -16.5 -45t-41.5 -19q-26 0 -45 16.5t-19 41.5q0 26 17 45t42 19t44 -16.5t19 -41.5zM964 585l350 581q-9 -8 -67.5 -62.5t-125.5 -116.5t-136.5 -127t-117 -110.5t-50.5 -51.5l-349 -580q7 7 67 62t126 116.5t136 127t117 111t50 50.5zM1611 640
+q0 -201 -104 -371q-3 2 -17 11t-26.5 16.5t-16.5 7.5q-13 0 -13 -13q0 -10 59 -44q-74 -112 -184.5 -190.5t-241.5 -110.5l-16 67q-1 10 -15 10q-5 0 -8 -5.5t-2 -9.5l16 -68q-72 -15 -146 -15q-199 0 -372 105q1 2 13 20.5t21.5 33.5t9.5 19q0 13 -13 13q-6 0 -17 -14.5
+t-22.5 -34.5t-13.5 -23q-113 75 -192 187.5t-110 244.5l69 15q10 3 10 15q0 5 -5.5 8t-10.5 2l-68 -15q-14 72 -14 139q0 206 109 379q2 -1 18.5 -12t30 -19t17.5 -8q13 0 13 12q0 6 -12.5 15.5t-32.5 21.5l-20 12q77 112 189 189t244 107l15 -67q2 -10 15 -10q5 0 8 5.5
+t2 10.5l-15 66q71 13 134 13q204 0 379 -109q-39 -56 -39 -65q0 -13 12 -13q11 0 48 64q111 -75 187.5 -186t107.5 -241l-56 -12q-10 -2 -10 -16q0 -5 5.5 -8t9.5 -2l57 13q14 -72 14 -140zM1696 640q0 163 -63.5 311t-170.5 255t-255 170.5t-311 63.5t-311 -63.5
+t-255 -170.5t-170.5 -255t-63.5 -311t63.5 -311t170.5 -255t255 -170.5t311 -63.5t311 63.5t255 170.5t170.5 255t63.5 311zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191
+t191 -286t71 -348z" />
+    <glyph glyph-name="_578" unicode="&#xf268;" horiz-adv-x="1792" 
+d="M893 1536q240 2 451 -120q232 -134 352 -372l-742 39q-160 9 -294 -74.5t-185 -229.5l-276 424q128 159 311 245.5t383 87.5zM146 1131l337 -663q72 -143 211 -217t293 -45l-230 -451q-212 33 -385 157.5t-272.5 316t-99.5 411.5q0 267 146 491zM1732 962
+q58 -150 59.5 -310.5t-48.5 -306t-153 -272t-246 -209.5q-230 -133 -498 -119l405 623q88 131 82.5 290.5t-106.5 277.5zM896 942q125 0 213.5 -88.5t88.5 -213.5t-88.5 -213.5t-213.5 -88.5t-213.5 88.5t-88.5 213.5t88.5 213.5t213.5 88.5z" />
+    <glyph glyph-name="_579" unicode="&#xf269;" horiz-adv-x="1792" 
+d="M903 -256q-283 0 -504.5 150.5t-329.5 398.5q-58 131 -67 301t26 332.5t111 312t179 242.5l-11 -281q11 14 68 15.5t70 -15.5q42 81 160.5 138t234.5 59q-54 -45 -119.5 -148.5t-58.5 -163.5q25 -8 62.5 -13.5t63 -7.5t68 -4t50.5 -3q15 -5 9.5 -45.5t-30.5 -75.5
+q-5 -7 -16.5 -18.5t-56.5 -35.5t-101 -34l15 -189l-139 67q-18 -43 -7.5 -81.5t36 -66.5t65.5 -41.5t81 -6.5q51 9 98 34.5t83.5 45t73.5 17.5q61 -4 89.5 -33t19.5 -65q-1 -2 -2.5 -5.5t-8.5 -12.5t-18 -15.5t-31.5 -10.5t-46.5 -1q-60 -95 -144.5 -135.5t-209.5 -29.5
+q74 -61 162.5 -82.5t168.5 -6t154.5 52t128 87.5t80.5 104q43 91 39 192.5t-37.5 188.5t-78.5 125q87 -38 137 -79.5t77 -112.5q15 170 -57.5 343t-209.5 284q265 -77 412 -279.5t151 -517.5q2 -127 -40.5 -255t-123.5 -238t-189 -196t-247.5 -135.5t-288.5 -49.5z" />
+    <glyph glyph-name="_580" unicode="&#xf26a;" horiz-adv-x="1792" 
+d="M1493 1308q-165 110 -359 110q-155 0 -293 -73t-240 -200q-75 -93 -119.5 -218t-48.5 -266v-42q4 -141 48.5 -266t119.5 -218q102 -127 240 -200t293 -73q194 0 359 110q-121 -108 -274.5 -168t-322.5 -60q-29 0 -43 1q-175 8 -333 82t-272 193t-181 281t-67 339
+q0 182 71 348t191 286t286 191t348 71h3q168 -1 320.5 -60.5t273.5 -167.5zM1792 640q0 -192 -77 -362.5t-213 -296.5q-104 -63 -222 -63q-137 0 -255 84q154 56 253.5 233t99.5 405q0 227 -99 404t-253 234q119 83 254 83q119 0 226 -65q135 -125 210.5 -295t75.5 -361z
+" />
+    <glyph glyph-name="_581" unicode="&#xf26b;" horiz-adv-x="1792" 
+d="M1792 599q0 -56 -7 -104h-1151q0 -146 109.5 -244.5t257.5 -98.5q99 0 185.5 46.5t136.5 130.5h423q-56 -159 -170.5 -281t-267.5 -188.5t-321 -66.5q-187 0 -356 83q-228 -116 -394 -116q-237 0 -237 263q0 115 45 275q17 60 109 229q199 360 475 606
+q-184 -79 -427 -354q63 274 283.5 449.5t501.5 175.5q30 0 45 -1q255 117 433 117q64 0 116 -13t94.5 -40.5t66.5 -76.5t24 -115q0 -116 -75 -286q101 -182 101 -390zM1722 1239q0 83 -53 132t-137 49q-108 0 -254 -70q121 -47 222.5 -131.5t170.5 -195.5q51 135 51 216z
+M128 2q0 -86 48.5 -132.5t134.5 -46.5q115 0 266 83q-122 72 -213.5 183t-137.5 245q-98 -205 -98 -332zM632 715h728q-5 142 -113 237t-251 95q-144 0 -251.5 -95t-112.5 -237z" />
+    <glyph glyph-name="_582" unicode="&#xf26c;" horiz-adv-x="2048" 
+d="M1792 288v960q0 13 -9.5 22.5t-22.5 9.5h-1600q-13 0 -22.5 -9.5t-9.5 -22.5v-960q0 -13 9.5 -22.5t22.5 -9.5h1600q13 0 22.5 9.5t9.5 22.5zM1920 1248v-960q0 -66 -47 -113t-113 -47h-736v-128h352q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-832q-14 0 -23 9t-9 23
+v64q0 14 9 23t23 9h352v128h-736q-66 0 -113 47t-47 113v960q0 66 47 113t113 47h1600q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="_583" unicode="&#xf26d;" horiz-adv-x="1792" 
+d="M138 1408h197q-70 -64 -126 -149q-36 -56 -59 -115t-30 -125.5t-8.5 -120t10.5 -132t21 -126t28 -136.5q4 -19 6 -28q51 -238 81 -329q57 -171 152 -275h-272q-48 0 -82 34t-34 82v1304q0 48 34 82t82 34zM1346 1408h308q48 0 82 -34t34 -82v-1304q0 -48 -34 -82t-82 -34
+h-178q212 210 196 565l-469 -101q-2 -45 -12 -82t-31 -72t-59.5 -59.5t-93.5 -36.5q-123 -26 -199 40q-32 27 -53 61t-51.5 129t-64.5 258q-35 163 -45.5 263t-5.5 139t23 77q20 41 62.5 73t102.5 45q45 12 83.5 6.5t67 -17t54 -35t43 -48t34.5 -56.5l468 100
+q-68 175 -180 287z" />
+    <glyph glyph-name="_584" unicode="&#xf26e;" 
+d="M1401 -11l-6 -6q-113 -113 -259 -175q-154 -64 -317 -64q-165 0 -317 64q-148 63 -259 175q-113 112 -175 258q-42 103 -54 189q-4 28 48 36q51 8 56 -20q1 -1 1 -4q18 -90 46 -159q50 -124 152 -226q98 -98 226 -152q132 -56 276 -56q143 0 276 56q128 55 225 152l6 6
+q10 10 25 6q12 -3 33 -22q36 -37 17 -58zM929 604l-66 -66l63 -63q21 -21 -7 -49q-17 -17 -32 -17q-10 0 -19 10l-62 61l-66 -66q-5 -5 -15 -5q-15 0 -31 16l-2 2q-18 15 -18 29q0 7 8 17l66 65l-66 66q-16 16 14 45q18 18 31 18q6 0 13 -5l65 -66l65 65q18 17 48 -13
+q27 -27 11 -44zM1400 547q0 -118 -46 -228q-45 -105 -126 -186q-80 -80 -187 -126t-228 -46t-228 46t-187 126q-82 82 -125 186q-15 33 -15 40h-1q-9 27 43 44q50 16 60 -12q37 -99 97 -167h1v339v2q3 136 102 232q105 103 253 103q147 0 251 -103t104 -249
+q0 -147 -104.5 -251t-250.5 -104q-58 0 -112 16q-28 11 -13 61q16 51 44 43l14 -3q14 -3 33 -6t30 -3q104 0 176 71.5t72 174.5q0 101 -72 171q-71 71 -175 71q-107 0 -178 -80q-64 -72 -64 -160v-413q110 -67 242 -67q96 0 185 36.5t156 103.5t103.5 155t36.5 183
+q0 198 -141 339q-140 140 -339 140q-200 0 -340 -140q-53 -53 -77 -87l-2 -2q-8 -11 -13 -15.5t-21.5 -9.5t-38.5 3q-21 5 -36.5 16.5t-15.5 26.5v680q0 15 10.5 26.5t27.5 11.5h877q30 0 30 -55t-30 -55h-811v-483h1q40 42 102 84t108 61q109 46 231 46q121 0 228 -46
+t187 -126q81 -81 126 -186q46 -112 46 -229zM1369 1128q9 -8 9 -18t-5.5 -18t-16.5 -21q-26 -26 -39 -26q-9 0 -16 7q-106 91 -207 133q-128 56 -276 56q-133 0 -262 -49q-27 -10 -45 37q-9 25 -8 38q3 16 16 20q130 57 299 57q164 0 316 -64q137 -58 235 -152z" />
+    <glyph glyph-name="_585" unicode="&#xf270;" horiz-adv-x="1792" 
+d="M1551 60q15 6 26 3t11 -17.5t-15 -33.5q-13 -16 -44 -43.5t-95.5 -68t-141 -74t-188 -58t-229.5 -24.5q-119 0 -238 31t-209 76.5t-172.5 104t-132.5 105t-84 87.5q-8 9 -10 16.5t1 12t8 7t11.5 2t11.5 -4.5q192 -117 300 -166q389 -176 799 -90q190 40 391 135z
+M1758 175q11 -16 2.5 -69.5t-28.5 -102.5q-34 -83 -85 -124q-17 -14 -26 -9t0 24q21 45 44.5 121.5t6.5 98.5q-5 7 -15.5 11.5t-27 6t-29.5 2.5t-35 0t-31.5 -2t-31 -3t-22.5 -2q-6 -1 -13 -1.5t-11 -1t-8.5 -1t-7 -0.5h-5.5h-4.5t-3 0.5t-2 1.5l-1.5 3q-6 16 47 40t103 30
+q46 7 108 1t76 -24zM1364 618q0 -31 13.5 -64t32 -58t37.5 -46t33 -32l13 -11l-227 -224q-40 37 -79 75.5t-58 58.5l-19 20q-11 11 -25 33q-38 -59 -97.5 -102.5t-127.5 -63.5t-140 -23t-137.5 21t-117.5 65.5t-83 113t-31 162.5q0 84 28 154t72 116.5t106.5 83t122.5 57
+t130 34.5t119.5 18.5t99.5 6.5v127q0 65 -21 97q-34 53 -121 53q-6 0 -16.5 -1t-40.5 -12t-56 -29.5t-56 -59.5t-48 -96l-294 27q0 60 22 119t67 113t108 95t151.5 65.5t190.5 24.5q100 0 181 -25t129.5 -61.5t81 -83t45 -86t12.5 -73.5v-589zM692 597q0 -86 70 -133
+q66 -44 139 -22q84 25 114 123q14 45 14 101v162q-59 -2 -111 -12t-106.5 -33.5t-87 -71t-32.5 -114.5z" />
+    <glyph glyph-name="_586" unicode="&#xf271;" horiz-adv-x="1792" 
+d="M1536 1280q52 0 90 -38t38 -90v-1280q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h128zM1152 1376v-288q0 -14 9 -23t23 -9
+h64q14 0 23 9t9 23v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23zM384 1376v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23zM1536 -128v1024h-1408v-1024h1408zM896 448h224q14 0 23 -9t9 -23v-64q0 -14 -9 -23t-23 -9h-224
+v-224q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v224h-224q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h224v224q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-224z" />
+    <glyph glyph-name="_587" unicode="&#xf272;" horiz-adv-x="1792" 
+d="M1152 416v-64q0 -14 -9 -23t-23 -9h-576q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h576q14 0 23 -9t9 -23zM128 -128h1408v1024h-1408v-1024zM512 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1280 1088v288q0 14 -9 23
+t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1664 1152v-1280q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47h64q66 0 113 -47
+t47 -113v-96h128q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_588" unicode="&#xf273;" horiz-adv-x="1792" 
+d="M1111 151l-46 -46q-9 -9 -22 -9t-23 9l-188 189l-188 -189q-10 -9 -23 -9t-22 9l-46 46q-9 9 -9 22t9 23l189 188l-189 188q-9 10 -9 23t9 22l46 46q9 9 22 9t23 -9l188 -188l188 188q10 9 23 9t22 -9l46 -46q9 -9 9 -22t-9 -23l-188 -188l188 -188q9 -10 9 -23t-9 -22z
+M128 -128h1408v1024h-1408v-1024zM512 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1280 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1664 1152v-1280
+q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h384v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h128q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_589" unicode="&#xf274;" horiz-adv-x="1792" 
+d="M1303 572l-512 -512q-10 -9 -23 -9t-23 9l-288 288q-9 10 -9 23t9 22l46 46q9 9 22 9t23 -9l220 -220l444 444q10 9 23 9t22 -9l46 -46q9 -9 9 -22t-9 -23zM128 -128h1408v1024h-1408v-1024zM512 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23
+t23 -9h64q14 0 23 9t9 23zM1280 1088v288q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-288q0 -14 9 -23t23 -9h64q14 0 23 9t9 23zM1664 1152v-1280q0 -52 -38 -90t-90 -38h-1408q-52 0 -90 38t-38 90v1280q0 52 38 90t90 38h128v96q0 66 47 113t113 47h64q66 0 113 -47
+t47 -113v-96h384v96q0 66 47 113t113 47h64q66 0 113 -47t47 -113v-96h128q52 0 90 -38t38 -90z" />
+    <glyph glyph-name="_590" unicode="&#xf275;" horiz-adv-x="1792" 
+d="M448 1536q26 0 45 -19t19 -45v-891l536 429q17 14 40 14q26 0 45 -19t19 -45v-379l536 429q17 14 40 14q26 0 45 -19t19 -45v-1152q0 -26 -19 -45t-45 -19h-1664q-26 0 -45 19t-19 45v1664q0 26 19 45t45 19h384z" />
+    <glyph glyph-name="_591" unicode="&#xf276;" horiz-adv-x="1024" 
+d="M512 448q66 0 128 15v-655q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v655q62 -15 128 -15zM512 1536q212 0 362 -150t150 -362t-150 -362t-362 -150t-362 150t-150 362t150 362t362 150zM512 1312q14 0 23 9t9 23t-9 23t-23 9q-146 0 -249 -103t-103 -249
+q0 -14 9 -23t23 -9t23 9t9 23q0 119 84.5 203.5t203.5 84.5z" />
+    <glyph glyph-name="_592" unicode="&#xf277;" horiz-adv-x="1792" 
+d="M1745 1239q10 -10 10 -23t-10 -23l-141 -141q-28 -28 -68 -28h-1344q-26 0 -45 19t-19 45v256q0 26 19 45t45 19h576v64q0 26 19 45t45 19h128q26 0 45 -19t19 -45v-64h512q40 0 68 -28zM768 320h256v-512q0 -26 -19 -45t-45 -19h-128q-26 0 -45 19t-19 45v512zM1600 768
+q26 0 45 -19t19 -45v-256q0 -26 -19 -45t-45 -19h-1344q-40 0 -68 28l-141 141q-10 10 -10 23t10 23l141 141q28 28 68 28h512v192h256v-192h576z" />
+    <glyph glyph-name="_593" unicode="&#xf278;" horiz-adv-x="2048" 
+d="M2020 1525q28 -20 28 -53v-1408q0 -20 -11 -36t-29 -23l-640 -256q-24 -11 -48 0l-616 246l-616 -246q-10 -5 -24 -5q-19 0 -36 11q-28 20 -28 53v1408q0 20 11 36t29 23l640 256q24 11 48 0l616 -246l616 246q32 13 60 -6zM736 1390v-1270l576 -230v1270zM128 1173
+v-1270l544 217v1270zM1920 107v1270l-544 -217v-1270z" />
+    <glyph glyph-name="_594" unicode="&#xf279;" horiz-adv-x="1792" 
+d="M512 1536q13 0 22.5 -9.5t9.5 -22.5v-1472q0 -20 -17 -28l-480 -256q-7 -4 -15 -4q-13 0 -22.5 9.5t-9.5 22.5v1472q0 20 17 28l480 256q7 4 15 4zM1760 1536q13 0 22.5 -9.5t9.5 -22.5v-1472q0 -20 -17 -28l-480 -256q-7 -4 -15 -4q-13 0 -22.5 9.5t-9.5 22.5v1472
+q0 20 17 28l480 256q7 4 15 4zM640 1536q8 0 14 -3l512 -256q18 -10 18 -29v-1472q0 -13 -9.5 -22.5t-22.5 -9.5q-8 0 -14 3l-512 256q-18 10 -18 29v1472q0 13 9.5 22.5t22.5 9.5z" />
+    <glyph glyph-name="_595" unicode="&#xf27a;" horiz-adv-x="1792" 
+d="M640 640q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1024 640q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1408 640q0 53 -37.5 90.5t-90.5 37.5
+t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5zM1792 640q0 -174 -120 -321.5t-326 -233t-450 -85.5q-110 0 -211 18q-173 -173 -435 -229q-52 -10 -86 -13q-12 -1 -22 6t-13 18q-4 15 20 37q5 5 23.5 21.5t25.5 23.5t23.5 25.5t24 31.5t20.5 37
+t20 48t14.5 57.5t12.5 72.5q-146 90 -229.5 216.5t-83.5 269.5q0 174 120 321.5t326 233t450 85.5t450 -85.5t326 -233t120 -321.5z" />
+    <glyph glyph-name="_596" unicode="&#xf27b;" horiz-adv-x="1792" 
+d="M640 640q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1024 640q0 -53 -37.5 -90.5t-90.5 -37.5t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM1408 640q0 -53 -37.5 -90.5t-90.5 -37.5
+t-90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5t90.5 -37.5t37.5 -90.5zM896 1152q-204 0 -381.5 -69.5t-282 -187.5t-104.5 -255q0 -112 71.5 -213.5t201.5 -175.5l87 -50l-27 -96q-24 -91 -70 -172q152 63 275 171l43 38l57 -6q69 -8 130 -8q204 0 381.5 69.5t282 187.5
+t104.5 255t-104.5 255t-282 187.5t-381.5 69.5zM1792 640q0 -174 -120 -321.5t-326 -233t-450 -85.5q-70 0 -145 8q-198 -175 -460 -242q-49 -14 -114 -22h-5q-15 0 -27 10.5t-16 27.5v1q-3 4 -0.5 12t2 10t4.5 9.5l6 9t7 8.5t8 9q7 8 31 34.5t34.5 38t31 39.5t32.5 51
+t27 59t26 76q-157 89 -247.5 220t-90.5 281q0 130 71 248.5t191 204.5t286 136.5t348 50.5t348 -50.5t286 -136.5t191 -204.5t71 -248.5z" />
+    <glyph glyph-name="_597" unicode="&#xf27c;" horiz-adv-x="1024" 
+d="M512 345l512 295v-591l-512 -296v592zM0 640v-591l512 296zM512 1527v-591l-512 -296v591zM512 936l512 295v-591z" />
+    <glyph glyph-name="_598" unicode="&#xf27d;" horiz-adv-x="1792" 
+d="M1709 1018q-10 -236 -332 -651q-333 -431 -562 -431q-142 0 -240 263q-44 160 -132 482q-72 262 -157 262q-18 0 -127 -76l-77 98q24 21 108 96.5t130 115.5q156 138 241 146q95 9 153 -55.5t81 -203.5q44 -287 66 -373q55 -249 120 -249q51 0 154 161q101 161 109 246
+q13 139 -109 139q-57 0 -121 -26q120 393 459 382q251 -8 236 -326z" />
+    <glyph glyph-name="f27e" unicode="&#xf27e;" 
+d="M0 1408h1536v-1536h-1536v1536zM1085 293l-221 631l221 297h-634l221 -297l-221 -631l317 -304z" />
+    <glyph glyph-name="uniF280" unicode="&#xf280;" 
+d="M0 1408h1536v-1536h-1536v1536zM908 1088l-12 -33l75 -83l-31 -114l25 -25l107 57l107 -57l25 25l-31 114l75 83l-12 33h-95l-53 96h-32l-53 -96h-95zM641 925q32 0 44.5 -16t11.5 -63l174 21q0 55 -17.5 92.5t-50.5 56t-69 25.5t-85 7q-133 0 -199 -57.5t-66 -182.5v-72
+h-96v-128h76q20 0 20 -8v-382q0 -14 -5 -20t-18 -7l-73 -7v-88h448v86l-149 14q-6 1 -8.5 1.5t-3.5 2.5t-0.5 4t1 7t0.5 10v387h191l38 128h-231q-6 0 -2 6t4 9v80q0 27 1.5 40.5t7.5 28t19.5 20t36.5 5.5zM1248 96v86l-54 9q-7 1 -9.5 2.5t-2.5 3t1 7.5t1 12v520h-275
+l-23 -101l83 -22q23 -7 23 -27v-370q0 -14 -6 -18.5t-20 -6.5l-70 -9v-86h352z" />
+    <glyph glyph-name="uniF281" unicode="&#xf281;" horiz-adv-x="1792" 
+d="M1792 690q0 -58 -29.5 -105.5t-79.5 -72.5q12 -46 12 -96q0 -155 -106.5 -287t-290.5 -208.5t-400 -76.5t-399.5 76.5t-290 208.5t-106.5 287q0 47 11 94q-51 25 -82 73.5t-31 106.5q0 82 58 140.5t141 58.5q85 0 145 -63q218 152 515 162l116 521q3 13 15 21t26 5
+l369 -81q18 37 54 59.5t79 22.5q62 0 106 -43.5t44 -105.5t-44 -106t-106 -44t-105.5 43.5t-43.5 105.5l-334 74l-104 -472q300 -9 519 -160q58 61 143 61q83 0 141 -58.5t58 -140.5zM418 491q0 -62 43.5 -106t105.5 -44t106 44t44 106t-44 105.5t-106 43.5q-61 0 -105 -44
+t-44 -105zM1228 136q11 11 11 26t-11 26q-10 10 -25 10t-26 -10q-41 -42 -121 -62t-160 -20t-160 20t-121 62q-11 10 -26 10t-25 -10q-11 -10 -11 -25.5t11 -26.5q43 -43 118.5 -68t122.5 -29.5t91 -4.5t91 4.5t122.5 29.5t118.5 68zM1225 341q62 0 105.5 44t43.5 106
+q0 61 -44 105t-105 44q-62 0 -106 -43.5t-44 -105.5t44 -106t106 -44z" />
+    <glyph glyph-name="_602" unicode="&#xf282;" horiz-adv-x="1792" 
+d="M69 741h1q16 126 58.5 241.5t115 217t167.5 176t223.5 117.5t276.5 43q231 0 414 -105.5t294 -303.5q104 -187 104 -442v-188h-1125q1 -111 53.5 -192.5t136.5 -122.5t189.5 -57t213 -3t208 46.5t173.5 84.5v-377q-92 -55 -229.5 -92t-312.5 -38t-316 53
+q-189 73 -311.5 249t-124.5 372q-3 242 111 412t325 268q-48 -60 -78 -125.5t-46 -159.5h635q8 77 -8 140t-47 101.5t-70.5 66.5t-80.5 41t-75 20.5t-56 8.5l-22 1q-135 -5 -259.5 -44.5t-223.5 -104.5t-176 -140.5t-138 -163.5z" />
+    <glyph glyph-name="_603" unicode="&#xf283;" horiz-adv-x="2304" 
+d="M0 32v608h2304v-608q0 -66 -47 -113t-113 -47h-1984q-66 0 -113 47t-47 113zM640 256v-128h384v128h-384zM256 256v-128h256v128h-256zM2144 1408q66 0 113 -47t47 -113v-224h-2304v224q0 66 47 113t113 47h1984z" />
+    <glyph glyph-name="_604" unicode="&#xf284;" horiz-adv-x="1792" 
+d="M1584 246l-218 111q-74 -120 -196.5 -189t-263.5 -69q-147 0 -271 72t-196 196t-72 270q0 110 42.5 209.5t115 172t172 115t209.5 42.5q131 0 247.5 -60.5t192.5 -168.5l215 125q-110 169 -286.5 265t-378.5 96q-161 0 -308 -63t-253 -169t-169 -253t-63 -308t63 -308
+t169 -253t253 -169t308 -63q213 0 397.5 107t290.5 292zM1030 643l693 -352q-116 -253 -334.5 -400t-492.5 -147q-182 0 -348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71q260 0 470.5 -133.5t335.5 -366.5zM1543 640h-39v-160h-96v352h136q32 0 54.5 -20
+t28.5 -48t1 -56t-27.5 -48t-57.5 -20z" />
+    <glyph glyph-name="uniF285" unicode="&#xf285;" horiz-adv-x="1792" 
+d="M1427 827l-614 386l92 151h855zM405 562l-184 116v858l1183 -743zM1424 697l147 -95v-858l-532 335zM1387 718l-500 -802h-855l356 571z" />
+    <glyph glyph-name="uniF286" unicode="&#xf286;" horiz-adv-x="1792" 
+d="M640 528v224q0 16 -16 16h-96q-16 0 -16 -16v-224q0 -16 16 -16h96q16 0 16 16zM1152 528v224q0 16 -16 16h-96q-16 0 -16 -16v-224q0 -16 16 -16h96q16 0 16 16zM1664 496v-752h-640v320q0 80 -56 136t-136 56t-136 -56t-56 -136v-320h-640v752q0 16 16 16h96
+q16 0 16 -16v-112h128v624q0 16 16 16h96q16 0 16 -16v-112h128v112q0 16 16 16h96q16 0 16 -16v-112h128v112q0 6 2.5 9.5t8.5 5t9.5 2t11.5 0t9 -0.5v391q-32 15 -32 50q0 23 16.5 39t38.5 16t38.5 -16t16.5 -39q0 -35 -32 -50v-17q45 10 83 10q21 0 59.5 -7.5t54.5 -7.5
+q17 0 47 7.5t37 7.5q16 0 16 -16v-210q0 -15 -35 -21.5t-62 -6.5q-18 0 -54.5 7.5t-55.5 7.5q-40 0 -90 -12v-133q1 0 9 0.5t11.5 0t9.5 -2t8.5 -5t2.5 -9.5v-112h128v112q0 16 16 16h96q16 0 16 -16v-112h128v112q0 16 16 16h96q16 0 16 -16v-624h128v112q0 16 16 16h96
+q16 0 16 -16z" />
+    <glyph glyph-name="_607" unicode="&#xf287;" horiz-adv-x="2304" 
+d="M2288 731q16 -8 16 -27t-16 -27l-320 -192q-8 -5 -16 -5q-9 0 -16 4q-16 10 -16 28v128h-858q37 -58 83 -165q16 -37 24.5 -55t24 -49t27 -47t27 -34t31.5 -26t33 -8h96v96q0 14 9 23t23 9h320q14 0 23 -9t9 -23v-320q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9t-9 23v96h-96
+q-32 0 -61 10t-51 23.5t-45 40.5t-37 46t-33.5 57t-28.5 57.5t-28 60.5q-23 53 -37 81.5t-36 65t-44.5 53.5t-46.5 17h-360q-22 -84 -91 -138t-157 -54q-106 0 -181 75t-75 181t75 181t181 75q88 0 157 -54t91 -138h104q24 0 46.5 17t44.5 53.5t36 65t37 81.5q19 41 28 60.5
+t28.5 57.5t33.5 57t37 46t45 40.5t51 23.5t61 10h107q21 57 70 92.5t111 35.5q80 0 136 -56t56 -136t-56 -136t-136 -56q-62 0 -111 35.5t-70 92.5h-107q-17 0 -33 -8t-31.5 -26t-27 -34t-27 -47t-24 -49t-24.5 -55q-46 -107 -83 -165h1114v128q0 18 16 28t32 -1z" />
+    <glyph glyph-name="_608" unicode="&#xf288;" horiz-adv-x="1792" 
+d="M1150 774q0 -56 -39.5 -95t-95.5 -39h-253v269h253q56 0 95.5 -39.5t39.5 -95.5zM1329 774q0 130 -91.5 222t-222.5 92h-433v-896h180v269h253q130 0 222 91.5t92 221.5zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348
+t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="_609" unicode="&#xf289;" horiz-adv-x="2304" 
+d="M1645 438q0 59 -34 106.5t-87 68.5q-7 -45 -23 -92q-7 -24 -27.5 -38t-44.5 -14q-12 0 -24 3q-31 10 -45 38.5t-4 58.5q23 71 23 143q0 123 -61 227.5t-166 165.5t-228 61q-134 0 -247 -73t-167 -194q108 -28 188 -106q22 -23 22 -55t-22 -54t-54 -22t-55 22
+q-75 75 -180 75q-106 0 -181 -74.5t-75 -180.5t75 -180.5t181 -74.5h1046q79 0 134.5 55.5t55.5 133.5zM1798 438q0 -142 -100.5 -242t-242.5 -100h-1046q-169 0 -289 119.5t-120 288.5q0 153 100 267t249 136q62 184 221 298t354 114q235 0 408.5 -158.5t196.5 -389.5
+q116 -25 192.5 -118.5t76.5 -214.5zM2048 438q0 -175 -97 -319q-23 -33 -64 -33q-24 0 -43 13q-26 17 -32 48.5t12 57.5q71 104 71 233t-71 233q-18 26 -12 57t32 49t57.5 11.5t49.5 -32.5q97 -142 97 -318zM2304 438q0 -244 -134 -443q-23 -34 -64 -34q-23 0 -42 13
+q-26 18 -32.5 49t11.5 57q108 164 108 358q0 195 -108 357q-18 26 -11.5 57.5t32.5 48.5q26 18 57 12t49 -33q134 -198 134 -442z" />
+    <glyph glyph-name="_610" unicode="&#xf28a;" 
+d="M1500 -13q0 -89 -63 -152.5t-153 -63.5t-153.5 63.5t-63.5 152.5q0 90 63.5 153.5t153.5 63.5t153 -63.5t63 -153.5zM1267 268q-115 -15 -192.5 -102.5t-77.5 -205.5q0 -74 33 -138q-146 -78 -379 -78q-109 0 -201 21t-153.5 54.5t-110.5 76.5t-76 85t-44.5 83
+t-23.5 66.5t-6 39.5q0 19 4.5 42.5t18.5 56t36.5 58t64 43.5t94.5 18t94 -17.5t63 -41t35.5 -53t17.5 -49t4 -33.5q0 -34 -23 -81q28 -27 82 -42t93 -17l40 -1q115 0 190 51t75 133q0 26 -9 48.5t-31.5 44.5t-49.5 41t-74 44t-93.5 47.5t-119.5 56.5q-28 13 -43 20
+q-116 55 -187 100t-122.5 102t-72 125.5t-20.5 162.5q0 78 20.5 150t66 137.5t112.5 114t166.5 77t221.5 28.5q120 0 220 -26t164.5 -67t109.5 -94t64 -105.5t19 -103.5q0 -46 -15 -82.5t-36.5 -58t-48.5 -36t-49 -19.5t-39 -5h-8h-32t-39 5t-44 14t-41 28t-37 46t-24 70.5
+t-10 97.5q-15 16 -59 25.5t-81 10.5l-37 1q-68 0 -117.5 -31t-70.5 -70t-21 -76q0 -24 5 -43t24 -46t53 -51t97 -53.5t150 -58.5q76 -25 138.5 -53.5t109 -55.5t83 -59t60.5 -59.5t41 -62.5t26.5 -62t14.5 -63.5t6 -62t1 -62.5z" />
+    <glyph glyph-name="_611" unicode="&#xf28b;" 
+d="M704 352v576q0 14 -9 23t-23 9h-256q-14 0 -23 -9t-9 -23v-576q0 -14 9 -23t23 -9h256q14 0 23 9t9 23zM1152 352v576q0 14 -9 23t-23 9h-256q-14 0 -23 -9t-9 -23v-576q0 -14 9 -23t23 -9h256q14 0 23 9t9 23zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103
+t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_612" unicode="&#xf28c;" 
+d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM768 96q148 0 273 73t198 198t73 273t-73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273
+t73 -273t198 -198t273 -73zM864 320q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-576q0 -14 -9 -23t-23 -9h-192zM480 320q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h192q14 0 23 -9t9 -23v-576q0 -14 -9 -23t-23 -9h-192z" />
+    <glyph glyph-name="_613" unicode="&#xf28d;" 
+d="M1088 352v576q0 14 -9 23t-23 9h-576q-14 0 -23 -9t-9 -23v-576q0 -14 9 -23t23 -9h576q14 0 23 9t9 23zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5
+t103 -385.5z" />
+    <glyph glyph-name="_614" unicode="&#xf28e;" 
+d="M768 1408q209 0 385.5 -103t279.5 -279.5t103 -385.5t-103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103zM768 96q148 0 273 73t198 198t73 273t-73 273t-198 198t-273 73t-273 -73t-198 -198t-73 -273
+t73 -273t198 -198t273 -73zM480 320q-14 0 -23 9t-9 23v576q0 14 9 23t23 9h576q14 0 23 -9t9 -23v-576q0 -14 -9 -23t-23 -9h-576z" />
+    <glyph glyph-name="_615" unicode="&#xf290;" horiz-adv-x="1792" 
+d="M1757 128l35 -313q3 -28 -16 -50q-19 -21 -48 -21h-1664q-29 0 -48 21q-19 22 -16 50l35 313h1722zM1664 967l86 -775h-1708l86 775q3 24 21 40.5t43 16.5h256v-128q0 -53 37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5v128h384v-128q0 -53 37.5 -90.5t90.5 -37.5
+t90.5 37.5t37.5 90.5v128h256q25 0 43 -16.5t21 -40.5zM1280 1152v-256q0 -26 -19 -45t-45 -19t-45 19t-19 45v256q0 106 -75 181t-181 75t-181 -75t-75 -181v-256q0 -26 -19 -45t-45 -19t-45 19t-19 45v256q0 159 112.5 271.5t271.5 112.5t271.5 -112.5t112.5 -271.5z" />
+    <glyph glyph-name="_616" unicode="&#xf291;" horiz-adv-x="2048" 
+d="M1920 768q53 0 90.5 -37.5t37.5 -90.5t-37.5 -90.5t-90.5 -37.5h-15l-115 -662q-8 -46 -44 -76t-82 -30h-1280q-46 0 -82 30t-44 76l-115 662h-15q-53 0 -90.5 37.5t-37.5 90.5t37.5 90.5t90.5 37.5h1792zM485 -32q26 2 43.5 22.5t15.5 46.5l-32 416q-2 26 -22.5 43.5
+t-46.5 15.5t-43.5 -22.5t-15.5 -46.5l32 -416q2 -25 20.5 -42t43.5 -17h5zM896 32v416q0 26 -19 45t-45 19t-45 -19t-19 -45v-416q0 -26 19 -45t45 -19t45 19t19 45zM1280 32v416q0 26 -19 45t-45 19t-45 -19t-19 -45v-416q0 -26 19 -45t45 -19t45 19t19 45zM1632 27l32 416
+q2 26 -15.5 46.5t-43.5 22.5t-46.5 -15.5t-22.5 -43.5l-32 -416q-2 -26 15.5 -46.5t43.5 -22.5h5q25 0 43.5 17t20.5 42zM476 1244l-93 -412h-132l101 441q19 88 89 143.5t160 55.5h167q0 26 19 45t45 19h384q26 0 45 -19t19 -45h167q90 0 160 -55.5t89 -143.5l101 -441
+h-132l-93 412q-11 44 -45.5 72t-79.5 28h-167q0 -26 -19 -45t-45 -19h-384q-26 0 -45 19t-19 45h-167q-45 0 -79.5 -28t-45.5 -72z" />
+    <glyph glyph-name="_617" unicode="&#xf292;" horiz-adv-x="1792" 
+d="M991 512l64 256h-254l-64 -256h254zM1759 1016l-56 -224q-7 -24 -31 -24h-327l-64 -256h311q15 0 25 -12q10 -14 6 -28l-56 -224q-5 -24 -31 -24h-327l-81 -328q-7 -24 -31 -24h-224q-16 0 -26 12q-9 12 -6 28l78 312h-254l-81 -328q-7 -24 -31 -24h-225q-15 0 -25 12
+q-9 12 -6 28l78 312h-311q-15 0 -25 12q-9 12 -6 28l56 224q7 24 31 24h327l64 256h-311q-15 0 -25 12q-10 14 -6 28l56 224q5 24 31 24h327l81 328q7 24 32 24h224q15 0 25 -12q9 -12 6 -28l-78 -312h254l81 328q7 24 32 24h224q15 0 25 -12q9 -12 6 -28l-78 -312h311
+q15 0 25 -12q9 -12 6 -28z" />
+    <glyph glyph-name="_618" unicode="&#xf293;" 
+d="M841 483l148 -148l-149 -149zM840 1094l149 -149l-148 -148zM710 -130l464 464l-306 306l306 306l-464 464v-611l-255 255l-93 -93l320 -321l-320 -321l93 -93l255 255v-611zM1429 640q0 -209 -32 -365.5t-87.5 -257t-140.5 -162.5t-181.5 -86.5t-219.5 -24.5
+t-219.5 24.5t-181.5 86.5t-140.5 162.5t-87.5 257t-32 365.5t32 365.5t87.5 257t140.5 162.5t181.5 86.5t219.5 24.5t219.5 -24.5t181.5 -86.5t140.5 -162.5t87.5 -257t32 -365.5z" />
+    <glyph glyph-name="_619" unicode="&#xf294;" horiz-adv-x="1024" 
+d="M596 113l173 172l-173 172v-344zM596 823l173 172l-173 172v-344zM628 640l356 -356l-539 -540v711l-297 -296l-108 108l372 373l-372 373l108 108l297 -296v711l539 -540z" />
+    <glyph glyph-name="_620" unicode="&#xf295;" 
+d="M1280 256q0 52 -38 90t-90 38t-90 -38t-38 -90t38 -90t90 -38t90 38t38 90zM512 1024q0 52 -38 90t-90 38t-90 -38t-38 -90t38 -90t90 -38t90 38t38 90zM1536 256q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5t271.5 -112.5
+t112.5 -271.5zM1440 1344q0 -20 -13 -38l-1056 -1408q-19 -26 -51 -26h-160q-26 0 -45 19t-19 45q0 20 13 38l1056 1408q19 26 51 26h160q26 0 45 -19t19 -45zM768 1024q0 -159 -112.5 -271.5t-271.5 -112.5t-271.5 112.5t-112.5 271.5t112.5 271.5t271.5 112.5
+t271.5 -112.5t112.5 -271.5z" />
+    <glyph glyph-name="_621" unicode="&#xf296;" horiz-adv-x="1792" 
+d="M104 830l792 -1015l-868 630q-18 13 -25 34.5t0 42.5l101 308v0zM566 830h660l-330 -1015v0zM368 1442l198 -612h-462l198 612q8 23 33 23t33 -23zM1688 830l101 -308q7 -21 0 -42.5t-25 -34.5l-868 -630l792 1015v0zM1688 830h-462l198 612q8 23 33 23t33 -23z" />
+    <glyph glyph-name="_622" unicode="&#xf297;" horiz-adv-x="1792" 
+d="M384 704h160v224h-160v-224zM1221 372v92q-104 -36 -243 -38q-135 -1 -259.5 46.5t-220.5 122.5l1 -96q88 -80 212 -128.5t272 -47.5q129 0 238 49zM640 704h640v224h-640v-224zM1792 736q0 -187 -99 -352q89 -102 89 -229q0 -157 -129.5 -268t-313.5 -111
+q-122 0 -225 52.5t-161 140.5q-19 -1 -57 -1t-57 1q-58 -88 -161 -140.5t-225 -52.5q-184 0 -313.5 111t-129.5 268q0 127 89 229q-99 165 -99 352q0 209 120 385.5t326.5 279.5t449.5 103t449.5 -103t326.5 -279.5t120 -385.5z" />
+    <glyph glyph-name="_623" unicode="&#xf298;" 
+d="M515 625v-128h-252v128h252zM515 880v-127h-252v127h252zM1273 369v-128h-341v128h341zM1273 625v-128h-672v128h672zM1273 880v-127h-672v127h672zM1408 20v1240q0 8 -6 14t-14 6h-32l-378 -256l-210 171l-210 -171l-378 256h-32q-8 0 -14 -6t-6 -14v-1240q0 -8 6 -14
+t14 -6h1240q8 0 14 6t6 14zM553 1130l185 150h-406zM983 1130l221 150h-406zM1536 1260v-1240q0 -62 -43 -105t-105 -43h-1240q-62 0 -105 43t-43 105v1240q0 62 43 105t105 43h1240q62 0 105 -43t43 -105z" />
+    <glyph glyph-name="_624" unicode="&#xf299;" horiz-adv-x="1792" 
+d="M896 720q-104 196 -160 278q-139 202 -347 318q-34 19 -70 36q-89 40 -94 32t34 -38l39 -31q62 -43 112.5 -93.5t94.5 -116.5t70.5 -113t70.5 -131q9 -17 13 -25q44 -84 84 -153t98 -154t115.5 -150t131 -123.5t148.5 -90.5q153 -66 154 -60q1 3 -49 37q-53 36 -81 57
+q-77 58 -179 211t-185 310zM549 177q-76 60 -132.5 125t-98 143.5t-71 154.5t-58.5 186t-52 209t-60.5 252t-76.5 289q273 0 497.5 -36t379 -92t271 -144.5t185.5 -172.5t110 -198.5t56 -199.5t12.5 -198.5t-9.5 -173t-20 -143.5t-13 -107l323 -327h-104l-281 285
+q-22 -2 -91.5 -14t-121.5 -19t-138 -6t-160.5 17t-167.5 59t-179 111z" />
+    <glyph glyph-name="_625" unicode="&#xf29a;" horiz-adv-x="1792" 
+d="M1374 879q-6 26 -28.5 39.5t-48.5 7.5q-261 -62 -401 -62t-401 62q-26 6 -48.5 -7.5t-28.5 -39.5t7.5 -48.5t39.5 -28.5q194 -46 303 -58q-2 -158 -15.5 -269t-26.5 -155.5t-41 -115.5l-9 -21q-10 -25 1 -49t36 -34q9 -4 23 -4q44 0 60 41l8 20q54 139 71 259h42
+q17 -120 71 -259l8 -20q16 -41 60 -41q14 0 23 4q25 10 36 34t1 49l-9 21q-28 71 -41 115.5t-26.5 155.5t-15.5 269q109 12 303 58q26 6 39.5 28.5t7.5 48.5zM1024 1024q0 53 -37.5 90.5t-90.5 37.5t-90.5 -37.5t-37.5 -90.5t37.5 -90.5t90.5 -37.5t90.5 37.5t37.5 90.5z
+M1600 640q0 -143 -55.5 -273.5t-150 -225t-225 -150t-273.5 -55.5t-273.5 55.5t-225 150t-150 225t-55.5 273.5t55.5 273.5t150 225t225 150t273.5 55.5t273.5 -55.5t225 -150t150 -225t55.5 -273.5zM896 1408q-156 0 -298 -61t-245 -164t-164 -245t-61 -298t61 -298
+t164 -245t245 -164t298 -61t298 61t245 164t164 245t61 298t-61 298t-164 245t-245 164t-298 61zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="_626" unicode="&#xf29b;" 
+d="M1438 723q34 -35 29 -82l-44 -551q-4 -42 -34.5 -70t-71.5 -28q-6 0 -9 1q-44 3 -72.5 36.5t-25.5 77.5l35 429l-143 -8q55 -113 55 -240q0 -216 -148 -372l-137 137q91 101 91 235q0 145 -102.5 248t-247.5 103q-134 0 -236 -92l-137 138q120 114 284 141l264 300
+l-149 87l-181 -161q-33 -30 -77 -27.5t-73 35.5t-26.5 77t34.5 73l239 213q26 23 60 26.5t64 -14.5l488 -283q36 -21 48 -68q17 -67 -26 -117l-205 -232l371 20q49 3 83 -32zM1240 1180q-74 0 -126 52t-52 126t52 126t126 52t126.5 -52t52.5 -126t-52.5 -126t-126.5 -52z
+M613 -62q106 0 196 61l139 -139q-146 -116 -335 -116q-148 0 -273.5 73t-198.5 198t-73 273q0 188 116 336l139 -139q-60 -88 -60 -197q0 -145 102.5 -247.5t247.5 -102.5z" />
+    <glyph glyph-name="_627" unicode="&#xf29c;" 
+d="M880 336v-160q0 -14 -9 -23t-23 -9h-160q-14 0 -23 9t-9 23v160q0 14 9 23t23 9h160q14 0 23 -9t9 -23zM1136 832q0 -50 -15 -90t-45.5 -69t-52 -44t-59.5 -36q-32 -18 -46.5 -28t-26 -24t-11.5 -29v-32q0 -14 -9 -23t-23 -9h-160q-14 0 -23 9t-9 23v68q0 35 10.5 64.5
+t24 47.5t39 35.5t41 25.5t44.5 21q53 25 75 43t22 49q0 42 -43.5 71.5t-95.5 29.5q-56 0 -95 -27q-29 -20 -80 -83q-9 -12 -25 -12q-11 0 -19 6l-108 82q-10 7 -12 20t5 23q122 192 349 192q129 0 238.5 -89.5t109.5 -214.5zM768 1280q-130 0 -248.5 -51t-204 -136.5
+t-136.5 -204t-51 -248.5t51 -248.5t136.5 -204t204 -136.5t248.5 -51t248.5 51t204 136.5t136.5 204t51 248.5t-51 248.5t-136.5 204t-204 136.5t-248.5 51zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5
+t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="_628" unicode="&#xf29d;" horiz-adv-x="1408" 
+d="M366 1225q-64 0 -110 45.5t-46 110.5q0 64 46 109.5t110 45.5t109.5 -45.5t45.5 -109.5q0 -65 -45.5 -110.5t-109.5 -45.5zM917 583q0 -50 -30 -67.5t-63.5 -6.5t-47.5 34l-367 438q-7 12 -14 15.5t-11 1.5l-3 -3q-7 -8 4 -21l122 -139l1 -354l-161 -457
+q-67 -192 -92 -234q-15 -26 -28 -32q-50 -26 -103 -1q-29 13 -41.5 43t-9.5 57q2 17 197 618l5 416l-85 -164l35 -222q4 -24 -1 -42t-14 -27.5t-19 -16t-17 -7.5l-7 -2q-19 -3 -34.5 3t-24 16t-14 22t-7.5 19.5t-2 9.5l-46 299l211 381q23 34 113 34q75 0 107 -40l424 -521
+q7 -5 14 -17l3 -3l-1 -1q7 -13 7 -29zM514 433q43 -113 88.5 -225t69.5 -168l24 -55q36 -93 42 -125q11 -70 -36 -97q-35 -22 -66 -16t-51 22t-29 35h-1q-6 16 -8 25l-124 351zM1338 -159q31 -49 31 -57q0 -5 -3 -7q-9 -5 -14.5 0.5t-15.5 26t-16 30.5q-114 172 -423 661
+q3 -1 7 1t7 4l3 2q11 9 11 17z" />
+    <glyph glyph-name="_629" unicode="&#xf29e;" horiz-adv-x="2304" 
+d="M504 542h171l-1 265zM1530 641q0 87 -50.5 140t-146.5 53h-54v-388h52q91 0 145 57t54 138zM956 1018l1 -756q0 -14 -9.5 -24t-23.5 -10h-216q-14 0 -23.5 10t-9.5 24v62h-291l-55 -81q-10 -15 -28 -15h-267q-21 0 -30.5 18t3.5 35l556 757q9 14 27 14h332q14 0 24 -10
+t10 -24zM1783 641q0 -193 -125.5 -303t-324.5 -110h-270q-14 0 -24 10t-10 24v756q0 14 10 24t24 10h268q200 0 326 -109t126 -302zM1939 640q0 -11 -0.5 -29t-8 -71.5t-21.5 -102t-44.5 -108t-73.5 -102.5h-51q38 45 66.5 104.5t41.5 112t21 98t9 72.5l1 27q0 8 -0.5 22.5
+t-7.5 60t-20 91.5t-41 111.5t-66 124.5h43q41 -47 72 -107t45.5 -111.5t23 -96t10.5 -70.5zM2123 640q0 -11 -0.5 -29t-8 -71.5t-21.5 -102t-45 -108t-74 -102.5h-51q38 45 66.5 104.5t41.5 112t21 98t9 72.5l1 27q0 8 -0.5 22.5t-7.5 60t-19.5 91.5t-40.5 111.5t-66 124.5
+h43q41 -47 72 -107t45.5 -111.5t23 -96t10.5 -70.5zM2304 640q0 -11 -0.5 -29t-8 -71.5t-21.5 -102t-44.5 -108t-73.5 -102.5h-51q38 45 66 104.5t41 112t21 98t9 72.5l1 27q0 8 -0.5 22.5t-7.5 60t-19.5 91.5t-40.5 111.5t-66 124.5h43q41 -47 72 -107t45.5 -111.5t23 -96
+t9.5 -70.5z" />
+    <glyph glyph-name="uniF2A0" unicode="&#xf2a0;" horiz-adv-x="1408" 
+d="M617 -153q0 11 -13 58t-31 107t-20 69q-1 4 -5 26.5t-8.5 36t-13.5 21.5q-15 14 -51 14q-23 0 -70 -5.5t-71 -5.5q-34 0 -47 11q-6 5 -11 15.5t-7.5 20t-6.5 24t-5 18.5q-37 128 -37 255t37 255q1 4 5 18.5t6.5 24t7.5 20t11 15.5q13 11 47 11q24 0 71 -5.5t70 -5.5
+q36 0 51 14q9 8 13.5 21.5t8.5 36t5 26.5q2 9 20 69t31 107t13 58q0 22 -43.5 52.5t-75.5 42.5q-20 8 -45 8q-34 0 -98 -18q-57 -17 -96.5 -40.5t-71 -66t-46 -70t-45.5 -94.5q-6 -12 -9 -19q-49 -107 -68 -216t-19 -244t19 -244t68 -216q56 -122 83 -161q63 -91 179 -127
+l6 -2q64 -18 98 -18q25 0 45 8q32 12 75.5 42.5t43.5 52.5zM776 760q-26 0 -45 19t-19 45.5t19 45.5q37 37 37 90q0 52 -37 91q-19 19 -19 45t19 45t45 19t45 -19q75 -75 75 -181t-75 -181q-21 -19 -45 -19zM957 579q-27 0 -45 19q-19 19 -19 45t19 45q112 114 112 272
+t-112 272q-19 19 -19 45t19 45t45 19t45 -19q150 -150 150 -362t-150 -362q-18 -19 -45 -19zM1138 398q-27 0 -45 19q-19 19 -19 45t19 45q90 91 138.5 208t48.5 245t-48.5 245t-138.5 208q-19 19 -19 45t19 45t45 19t45 -19q109 -109 167 -249t58 -294t-58 -294t-167 -249
+q-18 -19 -45 -19z" />
+    <glyph glyph-name="uniF2A1" unicode="&#xf2a1;" horiz-adv-x="2176" 
+d="M192 352q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM704 352q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM704 864q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM1472 352
+q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM1984 352q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM1472 864q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM1984 864
+q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM1984 1376q-66 0 -113 -47t-47 -113t47 -113t113 -47t113 47t47 113t-47 113t-113 47zM384 192q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM896 192q0 -80 -56 -136
+t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM384 704q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM896 704q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM384 1216q0 -80 -56 -136t-136 -56
+t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1664 192q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM896 1216q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM2176 192q0 -80 -56 -136t-136 -56t-136 56
+t-56 136t56 136t136 56t136 -56t56 -136zM1664 704q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM2176 704q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136zM1664 1216q0 -80 -56 -136t-136 -56t-136 56t-56 136
+t56 136t136 56t136 -56t56 -136zM2176 1216q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136z" />
+    <glyph glyph-name="uniF2A2" unicode="&#xf2a2;" horiz-adv-x="1792" 
+d="M128 -192q0 -26 -19 -45t-45 -19t-45 19t-19 45t19 45t45 19t45 -19t19 -45zM320 0q0 -26 -19 -45t-45 -19t-45 19t-19 45t19 45t45 19t45 -19t19 -45zM365 365l256 -256l-90 -90l-256 256zM704 384q0 -26 -19 -45t-45 -19t-45 19t-19 45t19 45t45 19t45 -19t19 -45z
+M1411 704q0 -59 -11.5 -108.5t-37.5 -93.5t-44 -67.5t-53 -64.5q-31 -35 -45.5 -54t-33.5 -50t-26.5 -64t-7.5 -74q0 -159 -112.5 -271.5t-271.5 -112.5q-26 0 -45 19t-19 45t19 45t45 19q106 0 181 75t75 181q0 57 11.5 105.5t37 91t43.5 66.5t52 63q40 46 59.5 72
+t37.5 74.5t18 103.5q0 185 -131.5 316.5t-316.5 131.5t-316.5 -131.5t-131.5 -316.5q0 -26 -19 -45t-45 -19t-45 19t-19 45q0 117 45.5 223.5t123 184t184 123t223.5 45.5t223.5 -45.5t184 -123t123 -184t45.5 -223.5zM896 576q0 -26 -19 -45t-45 -19t-45 19t-19 45t19 45
+t45 19t45 -19t19 -45zM1184 704q0 -26 -19 -45t-45 -19t-45 19t-19 45q0 93 -65.5 158.5t-158.5 65.5q-92 0 -158 -65.5t-66 -158.5q0 -26 -19 -45t-45 -19t-45 19t-19 45q0 146 103 249t249 103t249 -103t103 -249zM1578 993q10 -25 -1 -49t-36 -34q-9 -4 -23 -4
+q-19 0 -35.5 11t-23.5 30q-68 178 -224 295q-21 16 -25 42t12 47q17 21 43 25t47 -12q183 -137 266 -351zM1788 1074q9 -25 -1.5 -49t-35.5 -34q-11 -4 -23 -4q-44 0 -60 41q-92 238 -297 393q-22 16 -25.5 42t12.5 47q16 22 42 25.5t47 -12.5q235 -175 341 -449z" />
+    <glyph glyph-name="uniF2A3" unicode="&#xf2a3;" horiz-adv-x="2304" 
+d="M1032 576q-59 2 -84 55q-17 34 -48 53.5t-68 19.5q-53 0 -90.5 -37.5t-37.5 -90.5q0 -56 36 -89l10 -8q34 -31 82 -31q37 0 68 19.5t48 53.5q25 53 84 55zM1600 704q0 56 -36 89l-10 8q-34 31 -82 31q-37 0 -68 -19.5t-48 -53.5q-25 -53 -84 -55q59 -2 84 -55
+q17 -34 48 -53.5t68 -19.5q53 0 90.5 37.5t37.5 90.5zM1174 925q-17 -35 -55 -48t-73 4q-62 31 -134 31q-51 0 -99 -17q3 0 9.5 0.5t9.5 0.5q92 0 170.5 -50t118.5 -133q17 -36 3.5 -73.5t-49.5 -54.5q-18 -9 -39 -9q21 0 39 -9q36 -17 49.5 -54.5t-3.5 -73.5
+q-40 -83 -118.5 -133t-170.5 -50h-6q-16 2 -44 4l-290 27l-239 -120q-14 -7 -29 -7q-40 0 -57 35l-160 320q-11 23 -4 47.5t29 37.5l209 119l148 267q17 155 91.5 291.5t195.5 236.5q31 25 70.5 21.5t64.5 -34.5t21.5 -70t-34.5 -65q-70 -59 -117 -128q123 84 267 101
+q40 5 71.5 -19t35.5 -64q5 -40 -19 -71.5t-64 -35.5q-84 -10 -159 -55q46 10 99 10q115 0 218 -50q36 -18 49 -55.5t-5 -73.5zM2137 1085l160 -320q11 -23 4 -47.5t-29 -37.5l-209 -119l-148 -267q-17 -155 -91.5 -291.5t-195.5 -236.5q-26 -22 -61 -22q-45 0 -74 35
+q-25 31 -21.5 70t34.5 65q70 59 117 128q-123 -84 -267 -101q-4 -1 -12 -1q-36 0 -63.5 24t-31.5 60q-5 40 19 71.5t64 35.5q84 10 159 55q-46 -10 -99 -10q-115 0 -218 50q-36 18 -49 55.5t5 73.5q17 35 55 48t73 -4q62 -31 134 -31q51 0 99 17q-3 0 -9.5 -0.5t-9.5 -0.5
+q-92 0 -170.5 50t-118.5 133q-17 36 -3.5 73.5t49.5 54.5q18 9 39 9q-21 0 -39 9q-36 17 -49.5 54.5t3.5 73.5q40 83 118.5 133t170.5 50h6h1q14 -2 42 -4l291 -27l239 120q14 7 29 7q40 0 57 -35z" />
+    <glyph glyph-name="uniF2A4" unicode="&#xf2a4;" horiz-adv-x="1792" 
+d="M1056 704q0 -26 19 -45t45 -19t45 19t19 45q0 146 -103 249t-249 103t-249 -103t-103 -249q0 -26 19 -45t45 -19t45 19t19 45q0 93 66 158.5t158 65.5t158 -65.5t66 -158.5zM835 1280q-117 0 -223.5 -45.5t-184 -123t-123 -184t-45.5 -223.5q0 -26 19 -45t45 -19t45 19
+t19 45q0 185 131.5 316.5t316.5 131.5t316.5 -131.5t131.5 -316.5q0 -55 -18 -103.5t-37.5 -74.5t-59.5 -72q-34 -39 -52 -63t-43.5 -66.5t-37 -91t-11.5 -105.5q0 -106 -75 -181t-181 -75q-26 0 -45 -19t-19 -45t19 -45t45 -19q159 0 271.5 112.5t112.5 271.5q0 41 7.5 74
+t26.5 64t33.5 50t45.5 54q35 41 53 64.5t44 67.5t37.5 93.5t11.5 108.5q0 117 -45.5 223.5t-123 184t-184 123t-223.5 45.5zM591 561l226 -226l-579 -579q-12 -12 -29 -12t-29 12l-168 168q-12 12 -12 29t12 29zM1612 1524l168 -168q12 -12 12 -29t-12 -30l-233 -233
+l-26 -25l-71 -71q-66 153 -195 258l91 91l207 207q13 12 30 12t29 -12z" />
+    <glyph glyph-name="uniF2A5" unicode="&#xf2a5;" 
+d="M866 1021q0 -27 -13 -94q-11 -50 -31.5 -150t-30.5 -150q-2 -11 -4.5 -12.5t-13.5 -2.5q-20 -2 -31 -2q-58 0 -84 49.5t-26 113.5q0 88 35 174t103 124q28 14 51 14q28 0 36.5 -16.5t8.5 -47.5zM1352 597q0 14 -39 75.5t-52 66.5q-21 8 -34 8q-91 0 -226 -77l-2 2
+q3 22 27.5 135t24.5 178q0 233 -242 233q-24 0 -68 -6q-94 -17 -168.5 -89.5t-111.5 -166.5t-37 -189q0 -146 80.5 -225t227.5 -79q25 0 25 -3t-1 -5q-4 -34 -26 -117q-14 -52 -51.5 -101t-82.5 -49q-42 0 -42 47q0 24 10.5 47.5t25 39.5t29.5 28.5t26 20t11 8.5q0 3 -7 10
+q-24 22 -58.5 36.5t-65.5 14.5q-35 0 -63.5 -34t-41 -75t-12.5 -75q0 -88 51.5 -142t138.5 -54q82 0 155 53t117.5 126t65.5 153q6 22 15.5 66.5t14.5 66.5q3 12 14 18q118 60 227 60q48 0 127 -18q1 -1 4 -1q5 0 9.5 4.5t4.5 8.5zM1536 1120v-960q0 -119 -84.5 -203.5
+t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="uniF2A6" unicode="&#xf2a6;" horiz-adv-x="1535" 
+d="M744 1231q0 24 -2 38.5t-8.5 30t-21 23t-37.5 7.5q-39 0 -78 -23q-105 -58 -159 -190.5t-54 -269.5q0 -44 8.5 -85.5t26.5 -80.5t52.5 -62.5t81.5 -23.5q4 0 18 -0.5t20 0t16 3t15 8.5t7 16q16 77 48 231.5t48 231.5q19 91 19 146zM1498 575q0 -7 -7.5 -13.5t-15.5 -6.5
+l-6 1q-22 3 -62 11t-72 12.5t-63 4.5q-167 0 -351 -93q-15 -8 -21 -27q-10 -36 -24.5 -105.5t-22.5 -100.5q-23 -91 -70 -179.5t-112.5 -164.5t-154.5 -123t-185 -47q-135 0 -214.5 83.5t-79.5 219.5q0 53 19.5 117t63 116.5t97.5 52.5q38 0 120 -33.5t83 -61.5
+q0 -1 -16.5 -12.5t-39.5 -31t-46 -44.5t-39 -61t-16 -74q0 -33 16.5 -53t48.5 -20q45 0 85 31.5t66.5 78t48 105.5t32.5 107t16 90v9q0 2 -3.5 3.5t-8.5 1.5h-10t-10 -0.5t-6 -0.5q-227 0 -352 122.5t-125 348.5q0 108 34.5 221t96 210t156 167.5t204.5 89.5q52 9 106 9
+q374 0 374 -360q0 -98 -38 -273t-43 -211l3 -3q101 57 182.5 88t167.5 31q22 0 53 -13q19 -7 80 -102.5t61 -116.5z" />
+    <glyph glyph-name="uniF2A7" unicode="&#xf2a7;" horiz-adv-x="1664" 
+d="M831 863q32 0 59 -18l222 -148q61 -40 110 -97l146 -170q40 -46 29 -106l-72 -413q-6 -32 -29.5 -53.5t-55.5 -25.5l-527 -56l-352 -32h-9q-39 0 -67.5 28t-28.5 68q0 37 27 64t65 32l260 32h-448q-41 0 -69.5 30t-26.5 71q2 39 32 65t69 26l442 1l-521 64q-41 5 -66 37
+t-19 73q6 35 34.5 57.5t65.5 22.5h10l481 -60l-351 94q-38 10 -62 41.5t-18 68.5q6 36 33 58.5t62 22.5q6 0 20 -2l448 -96l217 -37q1 0 3 -0.5t3 -0.5q23 0 30.5 23t-12.5 36l-186 125q-35 23 -42 63.5t18 73.5q27 38 76 38zM761 661l186 -125l-218 37l-5 2l-36 38
+l-238 262q-1 1 -2.5 3.5t-2.5 3.5q-24 31 -18.5 70t37.5 64q31 23 68 17.5t64 -33.5l142 -147q-2 -1 -5 -3.5t-4 -4.5q-32 -45 -23 -99t55 -85zM1648 1115l15 -266q4 -73 -11 -147l-48 -219q-12 -59 -67 -87l-106 -54q2 62 -39 109l-146 170q-53 61 -117 103l-222 148
+q-34 23 -76 23q-51 0 -88 -37l-235 312q-25 33 -18 73.5t41 63.5q33 22 71.5 14t62.5 -40l266 -352l-262 455q-21 35 -10.5 75t47.5 59q35 18 72.5 6t57.5 -46l241 -420l-136 337q-15 35 -4.5 74t44.5 56q37 19 76 6t56 -51l193 -415l101 -196q8 -15 23 -17.5t27 7.5t11 26
+l-12 224q-2 41 26 71t69 31q39 0 67 -28.5t30 -67.5z" />
+    <glyph glyph-name="uniF2A8" unicode="&#xf2a8;" horiz-adv-x="1792" 
+d="M335 180q-2 0 -6 2q-86 57 -168.5 145t-139.5 180q-21 30 -21 69q0 9 2 19t4 18t7 18t8.5 16t10.5 17t10 15t12 15.5t11 14.5q184 251 452 365q-110 198 -110 211q0 19 17 29q116 64 128 64q18 0 28 -16l124 -229q92 19 192 19q266 0 497.5 -137.5t378.5 -369.5
+q20 -31 20 -69t-20 -69q-91 -142 -218.5 -253.5t-278.5 -175.5q110 -198 110 -211q0 -20 -17 -29q-116 -64 -127 -64q-19 0 -29 16l-124 229l-64 119l-444 820l7 7q-58 -24 -99 -47q3 -5 127 -234t243 -449t119 -223q0 -7 -9 -9q-13 -3 -72 -3q-57 0 -60 7l-456 841
+q-39 -28 -82 -68q24 -43 214 -393.5t190 -354.5q0 -10 -11 -10q-14 0 -82.5 22t-72.5 28l-106 197l-224 413q-44 -53 -78 -106q2 -3 18 -25t23 -34l176 -327q0 -10 -10 -10zM1165 282l49 -91q273 111 450 385q-180 277 -459 389q67 -64 103 -148.5t36 -176.5
+q0 -106 -47 -200.5t-132 -157.5zM848 896q0 -20 14 -34t34 -14q86 0 147 -61t61 -147q0 -20 14 -34t34 -14t34 14t14 34q0 126 -89 215t-215 89q-20 0 -34 -14t-14 -34zM1214 961l-9 4l7 -7z" />
+    <glyph glyph-name="uniF2A9" unicode="&#xf2a9;" horiz-adv-x="1280" 
+d="M1050 430q0 -215 -147 -374q-148 -161 -378 -161q-232 0 -378 161q-147 159 -147 374q0 147 68 270.5t189 196.5t268 73q96 0 182 -31q-32 -62 -39 -126q-66 28 -143 28q-167 0 -280.5 -123t-113.5 -291q0 -170 112.5 -288.5t281.5 -118.5t281 118.5t112 288.5
+q0 89 -32 166q66 13 123 49q41 -98 41 -212zM846 619q0 -192 -79.5 -345t-238.5 -253l-14 -1q-29 0 -62 5q83 32 146.5 102.5t99.5 154.5t58.5 189t30 192.5t7.5 178.5q0 69 -3 103q55 -160 55 -326zM791 947v-2q-73 214 -206 440q88 -59 142.5 -186.5t63.5 -251.5z
+M1035 744q-83 0 -160 75q218 120 290 247q19 37 21 56q-42 -94 -139.5 -166.5t-204.5 -97.5q-35 54 -35 113q0 37 17 79t43 68q46 44 157 74q59 16 106 58.5t74 100.5q74 -105 74 -253q0 -109 -24 -170q-32 -77 -88.5 -130.5t-130.5 -53.5z" />
+    <glyph glyph-name="uniF2AA" unicode="&#xf2aa;" 
+d="M1050 495q0 78 -28 147q-41 -25 -85 -34q22 -50 22 -114q0 -117 -77 -198.5t-193 -81.5t-193.5 81.5t-77.5 198.5q0 115 78 199.5t193 84.5q53 0 98 -19q4 43 27 87q-60 21 -125 21q-154 0 -257.5 -108.5t-103.5 -263.5t103.5 -261t257.5 -106t257.5 106.5t103.5 260.5z
+M872 850q2 -24 2 -71q0 -63 -5 -123t-20.5 -132.5t-40.5 -130t-68.5 -106t-100.5 -70.5q21 -3 42 -3h10q219 139 219 411q0 116 -38 225zM872 850q-4 80 -44 171.5t-98 130.5q92 -156 142 -302zM1207 955q0 102 -51 174q-41 -86 -124 -109q-69 -19 -109 -53.5t-40 -99.5
+q0 -40 24 -77q74 17 140.5 67t95.5 115q-4 -52 -74.5 -111.5t-138.5 -97.5q52 -52 110 -52q51 0 90 37t60 90q17 42 17 117zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960q119 0 203.5 -84.5
+t84.5 -203.5z" />
+    <glyph glyph-name="uniF2AB" unicode="&#xf2ab;" 
+d="M1279 388q0 22 -22 27q-67 15 -118 59t-80 108q-7 19 -7 25q0 15 19.5 26t43 17t43 20.5t19.5 36.5q0 19 -18.5 31.5t-38.5 12.5q-12 0 -32 -8t-31 -8q-4 0 -12 2q5 95 5 114q0 79 -17 114q-36 78 -103 121.5t-152 43.5q-199 0 -275 -165q-17 -35 -17 -114q0 -19 5 -114
+q-4 -2 -14 -2q-12 0 -32 7.5t-30 7.5q-21 0 -38.5 -12t-17.5 -32q0 -21 19.5 -35.5t43 -20.5t43 -17t19.5 -26q0 -6 -7 -25q-64 -138 -198 -167q-22 -5 -22 -27q0 -46 137 -68q2 -5 6 -26t11.5 -30.5t23.5 -9.5q12 0 37.5 4.5t39.5 4.5q35 0 67 -15t54 -32.5t57.5 -32.5
+t76.5 -15q43 0 79 15t57.5 32.5t53.5 32.5t67 15q14 0 39.5 -4t38.5 -4q16 0 23 10t11 30t6 25q137 22 137 68zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5
+t103 -385.5z" />
+    <glyph glyph-name="uniF2AC" unicode="&#xf2ac;" horiz-adv-x="1664" 
+d="M848 1408q134 1 240.5 -68.5t163.5 -192.5q27 -58 27 -179q0 -47 -9 -191q14 -7 28 -7q18 0 51 13.5t51 13.5q29 0 56 -18t27 -46q0 -32 -31.5 -54t-69 -31.5t-69 -29t-31.5 -47.5q0 -15 12 -43q37 -82 102.5 -150t144.5 -101q28 -12 80 -23q28 -6 28 -35
+q0 -70 -219 -103q-7 -11 -11 -39t-14 -46.5t-33 -18.5q-20 0 -62 6.5t-64 6.5q-37 0 -62 -5q-32 -5 -63 -22.5t-58 -38t-58 -40.5t-76 -33.5t-99 -13.5q-52 0 -96.5 13.5t-75 33.5t-57.5 40.5t-58 38t-62 22.5q-26 5 -63 5q-24 0 -65.5 -7.5t-58.5 -7.5q-25 0 -35 18.5
+t-14 47.5t-11 40q-219 33 -219 103q0 29 28 35q52 11 80 23q78 32 144.5 101t102.5 150q12 28 12 43q0 28 -31.5 47.5t-69.5 29.5t-69.5 31.5t-31.5 52.5q0 27 26 45.5t55 18.5q15 0 48 -13t53 -13q18 0 32 7q-9 142 -9 190q0 122 27 180q64 137 172 198t264 63z" />
+    <glyph glyph-name="uniF2AD" unicode="&#xf2ad;" 
+d="M1280 388q0 22 -22 27q-67 14 -118 58t-80 109q-7 14 -7 25q0 15 19.5 26t42.5 17t42.5 20.5t19.5 36.5q0 19 -18.5 31.5t-38.5 12.5q-11 0 -31 -8t-32 -8q-4 0 -12 2q5 63 5 115q0 78 -17 114q-36 78 -102.5 121.5t-152.5 43.5q-198 0 -275 -165q-18 -38 -18 -115
+q0 -38 6 -114q-10 -2 -15 -2q-11 0 -31.5 8t-30.5 8q-20 0 -37.5 -12.5t-17.5 -32.5q0 -21 19.5 -35.5t42.5 -20.5t42.5 -17t19.5 -26q0 -11 -7 -25q-64 -138 -198 -167q-22 -5 -22 -27q0 -47 138 -69q2 -5 6 -26t11 -30.5t23 -9.5q13 0 38.5 5t38.5 5q35 0 67.5 -15
+t54.5 -32.5t57.5 -32.5t76.5 -15q43 0 79 15t57.5 32.5t54 32.5t67.5 15q13 0 39 -4.5t39 -4.5q15 0 22.5 9.5t11.5 31t5 24.5q138 22 138 69zM1536 1120v-960q0 -119 -84.5 -203.5t-203.5 -84.5h-960q-119 0 -203.5 84.5t-84.5 203.5v960q0 119 84.5 203.5t203.5 84.5h960
+q119 0 203.5 -84.5t84.5 -203.5z" />
+    <glyph glyph-name="uniF2AE" unicode="&#xf2ae;" horiz-adv-x="2304" 
+d="M2304 1536q-69 -46 -125 -92t-89 -81t-59.5 -71.5t-37.5 -57.5t-22 -44.5t-14 -29.5q-10 -18 -35.5 -136.5t-48.5 -164.5q-15 -29 -50 -60.5t-67.5 -50.5t-72.5 -41t-48 -28q-47 -31 -151 -231q-341 14 -630 -158q-92 -53 -303 -179q47 16 86 31t55 22l15 7
+q71 27 163 64.5t133.5 53.5t108 34.5t142.5 31.5q186 31 465 -7q1 0 10 -3q11 -6 14 -17t-3 -22l-194 -345q-15 -29 -47 -22q-128 24 -354 24q-146 0 -402 -44.5t-392 -46.5q-82 -1 -149 13t-107 37t-61 40t-33 34l-1 1v2q0 6 6 6q138 0 371 55q192 366 374.5 524t383.5 158
+q5 0 14.5 -0.5t38 -5t55 -12t61.5 -24.5t63 -39.5t54 -59t40 -82.5l102 177q2 4 21 42.5t44.5 86.5t61 109.5t84 133.5t100.5 137q66 82 128 141.5t121.5 96.5t92.5 53.5t88 39.5z" />
+    <glyph glyph-name="uniF2B0" unicode="&#xf2b0;" 
+d="M1322 640q0 -45 -5 -76l-236 14l224 -78q-19 -73 -58 -141l-214 103l177 -158q-44 -61 -107 -108l-157 178l103 -215q-61 -37 -140 -59l-79 228l14 -240q-38 -6 -76 -6t-76 6l14 238l-78 -226q-74 19 -140 59l103 215l-157 -178q-59 43 -108 108l178 158l-214 -104
+q-39 69 -58 141l224 79l-237 -14q-5 42 -5 76q0 35 5 77l238 -14l-225 79q19 73 58 140l214 -104l-177 159q46 61 107 108l158 -178l-103 215q67 39 140 58l77 -224l-13 236q36 6 75 6q38 0 76 -6l-14 -237l78 225q74 -19 140 -59l-103 -214l158 178q61 -47 107 -108
+l-177 -159l213 104q37 -62 58 -141l-224 -78l237 14q5 -31 5 -77zM1352 640q0 160 -78.5 295.5t-213 214t-292.5 78.5q-119 0 -227 -46.5t-186.5 -125t-124.5 -187.5t-46 -229q0 -119 46 -228t124.5 -187.5t186.5 -125t227 -46.5q158 0 292.5 78.5t213 214t78.5 294.5z
+M1425 1023v-766l-657 -383l-657 383v766l657 383zM768 -183l708 412v823l-708 411l-708 -411v-823zM1536 1088v-896l-768 -448l-768 448v896l768 448z" />
+    <glyph glyph-name="uniF2B1" unicode="&#xf2b1;" horiz-adv-x="1664" 
+d="M339 1318h691l-26 -72h-665q-110 0 -188.5 -79t-78.5 -189v-771q0 -95 60.5 -169.5t153.5 -93.5q23 -5 98 -5v-72h-45q-140 0 -239.5 100t-99.5 240v771q0 140 99.5 240t239.5 100zM1190 1536h247l-482 -1294q-23 -61 -40.5 -103.5t-45 -98t-54 -93.5t-64.5 -78.5
+t-79.5 -65t-95.5 -41t-116 -18.5v195q163 26 220 182q20 52 20 105q0 54 -20 106l-285 733h228l187 -585zM1664 978v-1111h-795q37 55 45 73h678v1038q0 85 -49.5 155t-129.5 99l25 67q101 -34 163.5 -123.5t62.5 -197.5z" />
+    <glyph glyph-name="uniF2B2" unicode="&#xf2b2;" horiz-adv-x="1792" 
+d="M852 1227q0 -29 -17 -52.5t-45 -23.5t-45 23.5t-17 52.5t17 52.5t45 23.5t45 -23.5t17 -52.5zM688 -149v114q0 30 -20.5 51.5t-50.5 21.5t-50 -21.5t-20 -51.5v-114q0 -30 20.5 -52t49.5 -22q30 0 50.5 22t20.5 52zM860 -149v114q0 30 -20 51.5t-50 21.5t-50.5 -21.5
+t-20.5 -51.5v-114q0 -30 20.5 -52t50.5 -22q29 0 49.5 22t20.5 52zM1034 -149v114q0 30 -20.5 51.5t-50.5 21.5t-50.5 -21.5t-20.5 -51.5v-114q0 -30 20.5 -52t50.5 -22t50.5 22t20.5 52zM1208 -149v114q0 30 -20.5 51.5t-50.5 21.5t-50.5 -21.5t-20.5 -51.5v-114
+q0 -30 20.5 -52t50.5 -22t50.5 22t20.5 52zM1476 535q-84 -160 -232 -259.5t-323 -99.5q-123 0 -229.5 51.5t-178.5 137t-113 197.5t-41 232q0 88 21 174q-104 -175 -104 -390q0 -162 65 -312t185 -251q30 57 91 57q56 0 86 -50q32 50 87 50q56 0 86 -50q32 50 87 50t87 -50
+q30 50 86 50q28 0 52.5 -15.5t37.5 -40.5q112 94 177 231.5t73 287.5zM1326 564q0 75 -72 75q-17 0 -47 -6q-95 -19 -149 -19q-226 0 -226 243q0 86 30 204q-83 -127 -83 -275q0 -150 89 -260.5t235 -110.5q111 0 210 70q13 48 13 79zM884 1223q0 50 -32 89.5t-81 39.5
+t-81 -39.5t-32 -89.5q0 -51 31.5 -90.5t81.5 -39.5t81.5 39.5t31.5 90.5zM1513 884q0 96 -37.5 179t-113 137t-173.5 54q-77 0 -149 -35t-127 -94q-48 -159 -48 -268q0 -104 45.5 -157t147.5 -53q53 0 142 19q36 6 53 6q51 0 77.5 -28t26.5 -80q0 -26 -4 -46
+q75 68 117.5 165.5t42.5 200.5zM1792 667q0 -111 -33.5 -249.5t-93.5 -204.5q-58 -64 -195 -142.5t-228 -104.5l-4 -1v-114q0 -43 -29.5 -75t-72.5 -32q-56 0 -86 50q-32 -50 -87 -50t-87 50q-30 -50 -86 -50q-55 0 -87 50q-30 -50 -86 -50q-47 0 -75 33.5t-28 81.5
+q-90 -68 -198 -68q-118 0 -211 80q54 1 106 20q-113 31 -182 127q32 -7 71 -7q89 0 164 46q-192 192 -240 306q-24 56 -24 160q0 57 9 125.5t31.5 146.5t55 141t86.5 105t120 42q59 0 81 -52q19 29 42 54q2 3 12 13t13 16q10 15 23 38t25 42t28 39q87 111 211.5 177
+t260.5 66q35 0 62 -4q59 64 146 64q83 0 140 -57q5 -5 5 -12q0 -5 -6 -13.5t-12.5 -16t-16 -17l-10.5 -10.5q17 -6 36 -18t19 -24q0 -6 -16 -25q157 -138 197 -378q25 30 60 30q45 0 100 -49q90 -80 90 -279z" />
+    <glyph glyph-name="uniF2B3" unicode="&#xf2b3;" 
+d="M917 631q0 33 -6 64h-362v-132h217q-12 -76 -74.5 -120.5t-142.5 -44.5q-99 0 -169 71.5t-70 170.5t70 170.5t169 71.5q93 0 153 -59l104 101q-108 100 -257 100q-160 0 -272 -112.5t-112 -271.5t112 -271.5t272 -112.5q165 0 266.5 105t101.5 270zM1262 585h109v110
+h-109v110h-110v-110h-110v-110h110v-110h110v110zM1536 640q0 -209 -103 -385.5t-279.5 -279.5t-385.5 -103t-385.5 103t-279.5 279.5t-103 385.5t103 385.5t279.5 279.5t385.5 103t385.5 -103t279.5 -279.5t103 -385.5z" />
+    <glyph glyph-name="uniF2B4" unicode="&#xf2b4;" 
+d="M1536 1024v-839q0 -48 -49 -62q-174 -52 -338 -52q-73 0 -215.5 29.5t-227.5 29.5q-164 0 -370 -48v-338h-160v1368q-63 25 -101 81t-38 124q0 91 64 155t155 64t155 -64t64 -155q0 -68 -38 -124t-101 -81v-68q190 44 343 44q99 0 198 -15q14 -2 111.5 -22.5t149.5 -20.5
+q77 0 165 18q11 2 80 21t89 19q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="uniF2B5" unicode="&#xf2b5;" horiz-adv-x="2304" 
+d="M192 384q40 0 56 32t0 64t-56 32t-56 -32t0 -64t56 -32zM1665 442q-10 13 -38.5 50t-41.5 54t-38 49t-42.5 53t-40.5 47t-45 49l-125 -140q-83 -94 -208.5 -92t-205.5 98q-57 69 -56.5 158t58.5 157l177 206q-22 11 -51 16.5t-47.5 6t-56.5 -0.5t-49 -1q-92 0 -158 -66
+l-158 -158h-155v-544q5 0 21 0.5t22 0t19.5 -2t20.5 -4.5t17.5 -8.5t18.5 -13.5l297 -292q115 -111 227 -111q78 0 125 47q57 -20 112.5 8t72.5 85q74 -6 127 44q20 18 36 45.5t14 50.5q10 -10 43 -10q43 0 77 21t49.5 53t12 71.5t-30.5 73.5zM1824 384h96v512h-93l-157 180
+q-66 76 -169 76h-167q-89 0 -146 -67l-209 -243q-28 -33 -28 -75t27 -75q43 -51 110 -52t111 49l193 218q25 23 53.5 21.5t47 -27t8.5 -56.5q16 -19 56 -63t60 -68q29 -36 82.5 -105.5t64.5 -84.5q52 -66 60 -140zM2112 384q40 0 56 32t0 64t-56 32t-56 -32t0 -64t56 -32z
+M2304 960v-640q0 -26 -19 -45t-45 -19h-434q-27 -65 -82 -106.5t-125 -51.5q-33 -48 -80.5 -81.5t-102.5 -45.5q-42 -53 -104.5 -81.5t-128.5 -24.5q-60 -34 -126 -39.5t-127.5 14t-117 53.5t-103.5 81l-287 282h-358q-26 0 -45 19t-19 45v672q0 26 19 45t45 19h421
+q14 14 47 48t47.5 48t44 40t50.5 37.5t51 25.5t62 19.5t68 5.5h117q99 0 181 -56q82 56 181 56h167q35 0 67 -6t56.5 -14.5t51.5 -26.5t44.5 -31t43 -39.5t39 -42t41 -48t41.5 -48.5h355q26 0 45 -19t19 -45z" />
+    <glyph glyph-name="uniF2B6" unicode="&#xf2b6;" horiz-adv-x="1792" 
+d="M1792 882v-978q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v978q0 15 11 24q8 7 39 34.5t41.5 36t45.5 37.5t70 55.5t96 73t143.5 107t192.5 140.5q5 4 52.5 40t71.5 52.5t64 35t69 18.5t69 -18.5t65 -35.5t71 -52t52 -40q110 -80 192.5 -140.5t143.5 -107
+t96 -73t70 -55.5t45.5 -37.5t41.5 -36t39 -34.5q11 -9 11 -24zM1228 297q263 191 345 252q11 8 12.5 20.5t-6.5 23.5l-38 52q-8 11 -21 12.5t-24 -6.5q-231 -169 -343 -250q-5 -3 -52 -39t-71.5 -52.5t-64.5 -35t-69 -18.5t-69 18.5t-64.5 35t-71.5 52.5t-52 39
+q-186 134 -343 250q-11 8 -24 6.5t-21 -12.5l-38 -52q-8 -11 -6.5 -23.5t12.5 -20.5q82 -61 345 -252q10 -8 50 -38t65 -47t64 -39.5t77.5 -33.5t75.5 -11t75.5 11t79 34.5t64.5 39.5t65 47.5t48 36.5z" />
+    <glyph glyph-name="uniF2B7" unicode="&#xf2b7;" horiz-adv-x="1792" 
+d="M1474 623l39 -51q8 -11 6.5 -23.5t-11.5 -20.5q-43 -34 -126.5 -98.5t-146.5 -113t-67 -51.5q-39 -32 -60 -48t-60.5 -41t-76.5 -36.5t-74 -11.5h-1h-1q-37 0 -74 11.5t-76 36.5t-61 41.5t-60 47.5q-5 4 -65 50.5t-143.5 111t-122.5 94.5q-11 8 -12.5 20.5t6.5 23.5
+l37 52q8 11 21.5 13t24.5 -7q94 -73 306 -236q5 -4 43.5 -35t60.5 -46.5t56.5 -32.5t58.5 -17h1h1q24 0 58.5 17t56.5 32.5t60.5 46.5t43.5 35q258 198 313 242q11 8 24 6.5t21 -12.5zM1664 -96v928q-90 83 -159 139q-91 74 -389 304q-3 2 -43 35t-61 48t-56 32.5t-59 17.5
+h-1h-1q-24 0 -59 -17.5t-56 -32.5t-61 -48t-43 -35q-215 -166 -315.5 -245.5t-129.5 -104t-82 -74.5q-14 -12 -21 -19v-928q0 -13 9.5 -22.5t22.5 -9.5h1472q13 0 22.5 9.5t9.5 22.5zM1792 832v-928q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v928q0 56 41 94
+q123 114 350 290.5t233 181.5q36 30 59 47.5t61.5 42t76 36.5t74.5 12h1h1q37 0 74.5 -12t76 -36.5t61.5 -42t59 -47.5q43 -36 156 -122t226 -177t201 -173q41 -38 41 -94z" />
+    <glyph glyph-name="uniF2B8" unicode="&#xf2b8;" 
+d="M330 1l202 -214l-34 236l-216 213zM556 -225l274 218l-11 245l-300 -215zM245 413l227 -213l-48 327l-245 204zM495 189l317 214l-14 324l-352 -200zM843 178l95 -80l-2 239l-103 79q0 -1 1 -8.5t0 -12t-5 -7.5l-78 -52l85 -70q7 -6 7 -88zM138 930l256 -200l-68 465
+l-279 173zM1173 267l15 234l-230 -164l2 -240zM417 722l373 194l-19 441l-423 -163zM1270 357l20 233l-226 142l-2 -105l144 -95q6 -4 4 -9l-7 -119zM1461 496l30 222l-179 -128l-20 -228zM1273 329l-71 49l-8 -117q0 -5 -4 -8l-234 -187q-7 -5 -14 0l-98 83l7 -161
+q0 -5 -4 -8l-293 -234q-4 -2 -6 -2q-8 2 -8 3l-228 242q-4 4 -59 277q-2 7 5 11l61 37q-94 86 -95 92l-72 351q-2 7 6 12l94 45q-133 100 -135 108l-96 466q-2 10 7 13l433 135q5 0 8 -1l317 -153q6 -4 6 -9l20 -463q0 -7 -6 -10l-118 -61l126 -85q5 -2 5 -8l5 -123l121 74
+q5 4 11 0l84 -56l3 110q0 6 5 9l206 126q6 3 11 0l245 -135q4 -4 5 -7t-6.5 -60t-17.5 -124.5t-10 -70.5q0 -5 -4 -7l-191 -153q-6 -5 -13 0z" />
+    <glyph glyph-name="uniF2B9" unicode="&#xf2b9;" horiz-adv-x="1664" 
+d="M1201 298q0 57 -5.5 107t-21 100.5t-39.5 86t-64 58t-91 22.5q-6 -4 -33.5 -20.5t-42.5 -24.5t-40.5 -20t-49 -17t-46.5 -5t-46.5 5t-49 17t-40.5 20t-42.5 24.5t-33.5 20.5q-51 0 -91 -22.5t-64 -58t-39.5 -86t-21 -100.5t-5.5 -107q0 -73 42 -121.5t103 -48.5h576
+q61 0 103 48.5t42 121.5zM1028 892q0 108 -76.5 184t-183.5 76t-183.5 -76t-76.5 -184q0 -107 76.5 -183t183.5 -76t183.5 76t76.5 183zM1664 352v-192q0 -14 -9 -23t-23 -9h-96v-224q0 -66 -47 -113t-113 -47h-1216q-66 0 -113 47t-47 113v1472q0 66 47 113t113 47h1216
+q66 0 113 -47t47 -113v-224h96q14 0 23 -9t9 -23v-192q0 -14 -9 -23t-23 -9h-96v-128h96q14 0 23 -9t9 -23v-192q0 -14 -9 -23t-23 -9h-96v-128h96q14 0 23 -9t9 -23z" />
+    <glyph glyph-name="uniF2BA" unicode="&#xf2ba;" horiz-adv-x="1664" 
+d="M1028 892q0 -107 -76.5 -183t-183.5 -76t-183.5 76t-76.5 183q0 108 76.5 184t183.5 76t183.5 -76t76.5 -184zM980 672q46 0 82.5 -17t60 -47.5t39.5 -67t24 -81t11.5 -82.5t3.5 -79q0 -67 -39.5 -118.5t-105.5 -51.5h-576q-66 0 -105.5 51.5t-39.5 118.5q0 48 4.5 93.5
+t18.5 98.5t36.5 91.5t63 64.5t93.5 26h5q7 -4 32 -19.5t35.5 -21t33 -17t37 -16t35 -9t39.5 -4.5t39.5 4.5t35 9t37 16t33 17t35.5 21t32 19.5zM1664 928q0 -13 -9.5 -22.5t-22.5 -9.5h-96v-128h96q13 0 22.5 -9.5t9.5 -22.5v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-96v-128h96
+q13 0 22.5 -9.5t9.5 -22.5v-192q0 -13 -9.5 -22.5t-22.5 -9.5h-96v-224q0 -66 -47 -113t-113 -47h-1216q-66 0 -113 47t-47 113v1472q0 66 47 113t113 47h1216q66 0 113 -47t47 -113v-224h96q13 0 22.5 -9.5t9.5 -22.5v-192zM1408 -96v1472q0 13 -9.5 22.5t-22.5 9.5h-1216
+q-13 0 -22.5 -9.5t-9.5 -22.5v-1472q0 -13 9.5 -22.5t22.5 -9.5h1216q13 0 22.5 9.5t9.5 22.5z" />
+    <glyph glyph-name="uniF2BB" unicode="&#xf2bb;" horiz-adv-x="2048" 
+d="M1024 405q0 64 -9 117.5t-29.5 103t-60.5 78t-97 28.5q-6 -4 -30 -18t-37.5 -21.5t-35.5 -17.5t-43 -14.5t-42 -4.5t-42 4.5t-43 14.5t-35.5 17.5t-37.5 21.5t-30 18q-57 0 -97 -28.5t-60.5 -78t-29.5 -103t-9 -117.5t37 -106.5t91 -42.5h512q54 0 91 42.5t37 106.5z
+M867 925q0 94 -66.5 160.5t-160.5 66.5t-160.5 -66.5t-66.5 -160.5t66.5 -160.5t160.5 -66.5t160.5 66.5t66.5 160.5zM1792 416v64q0 14 -9 23t-23 9h-576q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h576q14 0 23 9t9 23zM1792 676v56q0 15 -10.5 25.5t-25.5 10.5h-568
+q-15 0 -25.5 -10.5t-10.5 -25.5v-56q0 -15 10.5 -25.5t25.5 -10.5h568q15 0 25.5 10.5t10.5 25.5zM1792 928v64q0 14 -9 23t-23 9h-576q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h576q14 0 23 9t9 23zM2048 1248v-1216q0 -66 -47 -113t-113 -47h-352v96q0 14 -9 23t-23 9
+h-64q-14 0 -23 -9t-9 -23v-96h-768v96q0 14 -9 23t-23 9h-64q-14 0 -23 -9t-9 -23v-96h-352q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1728q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2BC" unicode="&#xf2bc;" horiz-adv-x="2048" 
+d="M1024 405q0 -64 -37 -106.5t-91 -42.5h-512q-54 0 -91 42.5t-37 106.5t9 117.5t29.5 103t60.5 78t97 28.5q6 -4 30 -18t37.5 -21.5t35.5 -17.5t43 -14.5t42 -4.5t42 4.5t43 14.5t35.5 17.5t37.5 21.5t30 18q57 0 97 -28.5t60.5 -78t29.5 -103t9 -117.5zM867 925
+q0 -94 -66.5 -160.5t-160.5 -66.5t-160.5 66.5t-66.5 160.5t66.5 160.5t160.5 66.5t160.5 -66.5t66.5 -160.5zM1792 480v-64q0 -14 -9 -23t-23 -9h-576q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h576q14 0 23 -9t9 -23zM1792 732v-56q0 -15 -10.5 -25.5t-25.5 -10.5h-568
+q-15 0 -25.5 10.5t-10.5 25.5v56q0 15 10.5 25.5t25.5 10.5h568q15 0 25.5 -10.5t10.5 -25.5zM1792 992v-64q0 -14 -9 -23t-23 -9h-576q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h576q14 0 23 -9t9 -23zM1920 32v1216q0 13 -9.5 22.5t-22.5 9.5h-1728q-13 0 -22.5 -9.5
+t-9.5 -22.5v-1216q0 -13 9.5 -22.5t22.5 -9.5h352v96q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-96h768v96q0 14 9 23t23 9h64q14 0 23 -9t9 -23v-96h352q13 0 22.5 9.5t9.5 22.5zM2048 1248v-1216q0 -66 -47 -113t-113 -47h-1728q-66 0 -113 47t-47 113v1216q0 66 47 113
+t113 47h1728q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2BD" unicode="&#xf2bd;" horiz-adv-x="1792" 
+d="M1523 197q-22 155 -87.5 257.5t-184.5 118.5q-67 -74 -159.5 -115.5t-195.5 -41.5t-195.5 41.5t-159.5 115.5q-119 -16 -184.5 -118.5t-87.5 -257.5q106 -150 271 -237.5t356 -87.5t356 87.5t271 237.5zM1280 896q0 159 -112.5 271.5t-271.5 112.5t-271.5 -112.5
+t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5zM1792 640q0 -182 -71 -347.5t-190.5 -286t-285.5 -191.5t-349 -71q-182 0 -348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="uniF2BE" unicode="&#xf2be;" horiz-adv-x="1792" 
+d="M896 1536q182 0 348 -71t286 -191t191 -286t71 -348q0 -181 -70.5 -347t-190.5 -286t-286 -191.5t-349 -71.5t-349 71t-285.5 191.5t-190.5 286t-71 347.5t71 348t191 286t286 191t348 71zM1515 185q149 205 149 455q0 156 -61 298t-164 245t-245 164t-298 61t-298 -61
+t-245 -164t-164 -245t-61 -298q0 -250 149 -455q66 327 306 327q131 -128 313 -128t313 128q240 0 306 -327zM1280 832q0 159 -112.5 271.5t-271.5 112.5t-271.5 -112.5t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5z" />
+    <glyph glyph-name="uniF2C0" unicode="&#xf2c0;" 
+d="M1201 752q47 -14 89.5 -38t89 -73t79.5 -115.5t55 -172t22 -236.5q0 -154 -100 -263.5t-241 -109.5h-854q-141 0 -241 109.5t-100 263.5q0 131 22 236.5t55 172t79.5 115.5t89 73t89.5 38q-79 125 -79 272q0 104 40.5 198.5t109.5 163.5t163.5 109.5t198.5 40.5
+t198.5 -40.5t163.5 -109.5t109.5 -163.5t40.5 -198.5q0 -147 -79 -272zM768 1408q-159 0 -271.5 -112.5t-112.5 -271.5t112.5 -271.5t271.5 -112.5t271.5 112.5t112.5 271.5t-112.5 271.5t-271.5 112.5zM1195 -128q88 0 150.5 71.5t62.5 173.5q0 239 -78.5 377t-225.5 145
+q-145 -127 -336 -127t-336 127q-147 -7 -225.5 -145t-78.5 -377q0 -102 62.5 -173.5t150.5 -71.5h854z" />
+    <glyph glyph-name="uniF2C1" unicode="&#xf2c1;" horiz-adv-x="1280" 
+d="M1024 278q0 -64 -37 -107t-91 -43h-512q-54 0 -91 43t-37 107t9 118t29.5 104t61 78.5t96.5 28.5q80 -75 188 -75t188 75q56 0 96.5 -28.5t61 -78.5t29.5 -104t9 -118zM870 797q0 -94 -67.5 -160.5t-162.5 -66.5t-162.5 66.5t-67.5 160.5t67.5 160.5t162.5 66.5
+t162.5 -66.5t67.5 -160.5zM1152 -96v1376h-1024v-1376q0 -13 9.5 -22.5t22.5 -9.5h960q13 0 22.5 9.5t9.5 22.5zM1280 1376v-1472q0 -66 -47 -113t-113 -47h-960q-66 0 -113 47t-47 113v1472q0 66 47 113t113 47h352v-96q0 -14 9 -23t23 -9h192q14 0 23 9t9 23v96h352
+q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2C2" unicode="&#xf2c2;" horiz-adv-x="2048" 
+d="M896 324q0 54 -7.5 100.5t-24.5 90t-51 68.5t-81 25q-64 -64 -156 -64t-156 64q-47 0 -81 -25t-51 -68.5t-24.5 -90t-7.5 -100.5q0 -55 31.5 -93.5t75.5 -38.5h426q44 0 75.5 38.5t31.5 93.5zM768 768q0 80 -56 136t-136 56t-136 -56t-56 -136t56 -136t136 -56t136 56
+t56 136zM1792 288v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704q14 0 23 9t9 23zM1408 544v64q0 14 -9 23t-23 9h-320q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h320q14 0 23 9t9 23zM1792 544v64q0 14 -9 23t-23 9h-192q-14 0 -23 -9t-9 -23
+v-64q0 -14 9 -23t23 -9h192q14 0 23 9t9 23zM1792 800v64q0 14 -9 23t-23 9h-704q-14 0 -23 -9t-9 -23v-64q0 -14 9 -23t23 -9h704q14 0 23 9t9 23zM128 1152h1792v96q0 14 -9 23t-23 9h-1728q-14 0 -23 -9t-9 -23v-96zM2048 1248v-1216q0 -66 -47 -113t-113 -47h-1728
+q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1728q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2C3" unicode="&#xf2c3;" horiz-adv-x="2048" 
+d="M896 324q0 -55 -31.5 -93.5t-75.5 -38.5h-426q-44 0 -75.5 38.5t-31.5 93.5q0 54 7.5 100.5t24.5 90t51 68.5t81 25q64 -64 156 -64t156 64q47 0 81 -25t51 -68.5t24.5 -90t7.5 -100.5zM768 768q0 -80 -56 -136t-136 -56t-136 56t-56 136t56 136t136 56t136 -56t56 -136z
+M1792 352v-64q0 -14 -9 -23t-23 -9h-704q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h704q14 0 23 -9t9 -23zM1408 608v-64q0 -14 -9 -23t-23 -9h-320q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h320q14 0 23 -9t9 -23zM1792 608v-64q0 -14 -9 -23t-23 -9h-192q-14 0 -23 9t-9 23v64
+q0 14 9 23t23 9h192q14 0 23 -9t9 -23zM1792 864v-64q0 -14 -9 -23t-23 -9h-704q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h704q14 0 23 -9t9 -23zM1920 32v1120h-1792v-1120q0 -13 9.5 -22.5t22.5 -9.5h1728q13 0 22.5 9.5t9.5 22.5zM2048 1248v-1216q0 -66 -47 -113t-113 -47
+h-1728q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1728q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2C4" unicode="&#xf2c4;" horiz-adv-x="1792" 
+d="M1255 749q0 318 -105 474.5t-330 156.5q-222 0 -326 -157t-104 -474q0 -316 104 -471.5t326 -155.5q74 0 131 17q-22 43 -39 73t-44 65t-53.5 56.5t-63 36t-77.5 14.5q-46 0 -79 -16l-49 97q105 91 276 91q132 0 215.5 -54t150.5 -155q67 149 67 402zM1645 117h117
+q3 -27 -2 -67t-26.5 -95t-58 -100.5t-107 -78t-162.5 -32.5q-71 0 -130.5 19t-105.5 56t-79 78t-66 96q-97 -27 -205 -27q-150 0 -292.5 58t-253 158.5t-178 249t-67.5 317.5q0 170 67.5 319.5t178.5 250.5t253.5 159t291.5 58q121 0 238.5 -36t217 -106t176 -164.5
+t119.5 -219t43 -261.5q0 -190 -80.5 -347.5t-218.5 -264.5q47 -70 93.5 -106.5t104.5 -36.5q61 0 94 37.5t38 85.5z" />
+    <glyph glyph-name="uniF2C5" unicode="&#xf2c5;" horiz-adv-x="2304" 
+d="M453 -101q0 -21 -16 -37.5t-37 -16.5q-1 0 -13 3q-63 15 -162 140q-225 284 -225 676q0 341 213 614q39 51 95 103.5t94 52.5q19 0 35 -13.5t16 -32.5q0 -27 -63 -90q-98 -102 -147 -184q-119 -199 -119 -449q0 -281 123 -491q50 -85 136 -173q2 -3 14.5 -16t19.5 -21
+t17 -20.5t14.5 -23.5t4.5 -21zM1796 33q0 -29 -17.5 -48.5t-46.5 -19.5h-1081q-26 0 -45 19t-19 45q0 29 17.5 48.5t46.5 19.5h1081q26 0 45 -19t19 -45zM1581 644q0 -134 -67 -233q-25 -38 -69.5 -78.5t-83.5 -60.5q-16 -10 -27 -10q-7 0 -15 6t-8 12q0 9 19 30t42 46
+t42 67.5t19 88.5q0 76 -35 130q-29 42 -46 42q-3 0 -3 -5q0 -12 7.5 -35.5t7.5 -36.5q0 -22 -21.5 -35t-44.5 -13q-66 0 -66 76q0 15 1.5 44t1.5 44q0 25 -10 46q-13 25 -42 53.5t-51 28.5q-5 0 -7 -0.5t-3.5 -2.5t-1.5 -6q0 -2 16 -26t16 -54q0 -37 -19 -68t-46 -54
+t-53.5 -46t-45.5 -54t-19 -68q0 -98 42 -160q29 -43 79 -63q16 -5 17 -10q1 -2 1 -5q0 -16 -18 -16q-6 0 -33 11q-119 43 -195 139.5t-76 218.5q0 55 24.5 115.5t60 115t70.5 108.5t59.5 113.5t24.5 111.5q0 53 -25 94q-29 48 -56 64q-19 9 -19 21q0 20 41 20q50 0 110 -29
+q41 -19 71 -44.5t49.5 -51t33.5 -62.5t22 -69t16 -80q0 -1 3 -17.5t4.5 -25t5.5 -25t9 -27t11 -21.5t14.5 -16.5t18.5 -5.5q23 0 37 14t14 37q0 25 -20 67t-20 52t10 10q27 0 93 -70q72 -76 102.5 -156t30.5 -186zM2304 615q0 -274 -138 -503q-19 -32 -48 -72t-68 -86.5
+t-81 -77t-74 -30.5q-16 0 -31 15.5t-15 31.5q0 15 29 50.5t68.5 77t48.5 52.5q183 230 183 531q0 131 -20.5 235t-72.5 211q-58 119 -163 228q-2 3 -13 13.5t-16.5 16.5t-15 17.5t-15 20t-9.5 18.5t-4 19q0 19 16 35.5t35 16.5q70 0 196 -169q98 -131 146 -273t60 -314
+q2 -42 2 -64z" />
+    <glyph glyph-name="uniF2C6" unicode="&#xf2c6;" horiz-adv-x="1792" 
+d="M1189 229l147 693q9 44 -10.5 63t-51.5 7l-864 -333q-29 -11 -39.5 -25t-2.5 -26.5t32 -19.5l221 -69l513 323q21 14 32 6q7 -5 -4 -15l-415 -375v0v0l-16 -228q23 0 45 22l108 104l224 -165q64 -36 81 38zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71
+t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="uniF2C7" unicode="&#xf2c7;" horiz-adv-x="1024" 
+d="M640 192q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 60 35 110t93 71v907h128v-907q58 -21 93 -71t35 -110zM768 192q0 77 -34 144t-94 112v768q0 80 -56 136t-136 56t-136 -56t-56 -136v-768q-60 -45 -94 -112t-34 -144q0 -133 93.5 -226.5t226.5 -93.5t226.5 93.5
+t93.5 226.5zM896 192q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 182 128 313v711q0 133 93.5 226.5t226.5 93.5t226.5 -93.5t93.5 -226.5v-711q128 -131 128 -313zM1024 768v-128h-192v128h192zM1024 1024v-128h-192v128h192zM1024 1280v-128h-192
+v128h192z" />
+    <glyph glyph-name="uniF2C8" unicode="&#xf2c8;" horiz-adv-x="1024" 
+d="M640 192q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 60 35 110t93 71v651h128v-651q58 -21 93 -71t35 -110zM768 192q0 77 -34 144t-94 112v768q0 80 -56 136t-136 56t-136 -56t-56 -136v-768q-60 -45 -94 -112t-34 -144q0 -133 93.5 -226.5t226.5 -93.5t226.5 93.5
+t93.5 226.5zM896 192q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 182 128 313v711q0 133 93.5 226.5t226.5 93.5t226.5 -93.5t93.5 -226.5v-711q128 -131 128 -313zM1024 768v-128h-192v128h192zM1024 1024v-128h-192v128h192zM1024 1280v-128h-192
+v128h192z" />
+    <glyph glyph-name="uniF2C9" unicode="&#xf2c9;" horiz-adv-x="1024" 
+d="M640 192q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 60 35 110t93 71v395h128v-395q58 -21 93 -71t35 -110zM768 192q0 77 -34 144t-94 112v768q0 80 -56 136t-136 56t-136 -56t-56 -136v-768q-60 -45 -94 -112t-34 -144q0 -133 93.5 -226.5t226.5 -93.5t226.5 93.5
+t93.5 226.5zM896 192q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 182 128 313v711q0 133 93.5 226.5t226.5 93.5t226.5 -93.5t93.5 -226.5v-711q128 -131 128 -313zM1024 768v-128h-192v128h192zM1024 1024v-128h-192v128h192zM1024 1280v-128h-192
+v128h192z" />
+    <glyph glyph-name="uniF2CA" unicode="&#xf2ca;" horiz-adv-x="1024" 
+d="M640 192q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 60 35 110t93 71v139h128v-139q58 -21 93 -71t35 -110zM768 192q0 77 -34 144t-94 112v768q0 80 -56 136t-136 56t-136 -56t-56 -136v-768q-60 -45 -94 -112t-34 -144q0 -133 93.5 -226.5t226.5 -93.5t226.5 93.5
+t93.5 226.5zM896 192q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 182 128 313v711q0 133 93.5 226.5t226.5 93.5t226.5 -93.5t93.5 -226.5v-711q128 -131 128 -313zM1024 768v-128h-192v128h192zM1024 1024v-128h-192v128h192zM1024 1280v-128h-192
+v128h192z" />
+    <glyph glyph-name="uniF2CB" unicode="&#xf2cb;" horiz-adv-x="1024" 
+d="M640 192q0 -80 -56 -136t-136 -56t-136 56t-56 136q0 79 56 135.5t136 56.5t136 -56.5t56 -135.5zM768 192q0 77 -34 144t-94 112v768q0 80 -56 136t-136 56t-136 -56t-56 -136v-768q-60 -45 -94 -112t-34 -144q0 -133 93.5 -226.5t226.5 -93.5t226.5 93.5t93.5 226.5z
+M896 192q0 -185 -131.5 -316.5t-316.5 -131.5t-316.5 131.5t-131.5 316.5q0 182 128 313v711q0 133 93.5 226.5t226.5 93.5t226.5 -93.5t93.5 -226.5v-711q128 -131 128 -313zM1024 768v-128h-192v128h192zM1024 1024v-128h-192v128h192zM1024 1280v-128h-192v128h192z" />
+    <glyph glyph-name="uniF2CC" unicode="&#xf2cc;" horiz-adv-x="1920" 
+d="M1433 1287q10 -10 10 -23t-10 -23l-626 -626q-10 -10 -23 -10t-23 10l-82 82q-10 10 -10 23t10 23l44 44q-72 91 -81.5 207t46.5 215q-74 71 -176 71q-106 0 -181 -75t-75 -181v-1280h-256v1280q0 104 40.5 198.5t109.5 163.5t163.5 109.5t198.5 40.5q106 0 201 -41
+t166 -115q94 39 197 24.5t185 -79.5l44 44q10 10 23 10t23 -10zM1344 1024q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1600 896q-26 0 -45 19t-19 45t19 45t45 19t45 -19t19 -45t-19 -45t-45 -19zM1856 1024q26 0 45 -19t19 -45t-19 -45t-45 -19
+t-45 19t-19 45t19 45t45 19zM1216 896q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1408 832q0 26 19 45t45 19t45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45zM1728 896q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1088 768
+q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1344 640q-26 0 -45 19t-19 45t19 45t45 19t45 -19t19 -45t-19 -45t-45 -19zM1600 768q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1216 512q-26 0 -45 19t-19 45t19 45t45 19t45 -19
+t19 -45t-19 -45t-45 -19zM1472 640q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1088 512q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1344 512q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1216 384
+q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19zM1088 256q26 0 45 -19t19 -45t-19 -45t-45 -19t-45 19t-19 45t19 45t45 19z" />
+    <glyph glyph-name="uniF2CD" unicode="&#xf2cd;" horiz-adv-x="1792" 
+d="M1664 448v-192q0 -169 -128 -286v-194q0 -14 -9 -23t-23 -9h-64q-14 0 -23 9t-9 23v118q-63 -22 -128 -22h-768q-65 0 -128 22v-110q0 -17 -9.5 -28.5t-22.5 -11.5h-64q-13 0 -22.5 11.5t-9.5 28.5v186q-128 117 -128 286v192h1536zM704 864q0 -14 -9 -23t-23 -9t-23 9
+t-9 23t9 23t23 9t23 -9t9 -23zM768 928q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM704 992q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM832 992q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM768 1056q0 -14 -9 -23t-23 -9t-23 9
+t-9 23t9 23t23 9t23 -9t9 -23zM704 1120q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM1792 608v-64q0 -14 -9 -23t-23 -9h-1728q-14 0 -23 9t-9 23v64q0 14 9 23t23 9h96v640q0 106 75 181t181 75q108 0 184 -78q46 19 98 12t93 -39l22 22q11 11 22 0l42 -42
+q11 -11 0 -22l-314 -314q-11 -11 -22 0l-42 42q-11 11 0 22l22 22q-36 46 -40.5 104t23.5 108q-37 35 -88 35q-53 0 -90.5 -37.5t-37.5 -90.5v-640h1504q14 0 23 -9t9 -23zM896 1056q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM832 1120q0 -14 -9 -23t-23 -9
+t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM768 1184q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM960 1120q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM896 1184q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM832 1248q0 -14 -9 -23
+t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM1024 1184q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM960 1248q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23zM1088 1248q0 -14 -9 -23t-23 -9t-23 9t-9 23t9 23t23 9t23 -9t9 -23z" />
+    <glyph glyph-name="uniF2CE" unicode="&#xf2ce;" 
+d="M994 344q0 -86 -17 -197q-31 -215 -55 -313q-22 -90 -152 -90t-152 90q-24 98 -55 313q-17 110 -17 197q0 168 224 168t224 -168zM1536 768q0 -240 -134 -434t-350 -280q-8 -3 -15 3t-6 15q7 48 10 66q4 32 6 47q1 9 9 12q159 81 255.5 234t96.5 337q0 180 -91 330.5
+t-247 234.5t-337 74q-124 -7 -237 -61t-193.5 -140.5t-128 -202t-46.5 -240.5q1 -184 99 -336.5t257 -231.5q7 -3 9 -12q3 -21 6 -45q1 -9 5 -32.5t6 -35.5q1 -9 -6.5 -15t-15.5 -2q-148 58 -261 169.5t-173.5 264t-52.5 319.5q7 143 66 273.5t154.5 227t225 157.5t272.5 70
+q164 10 315.5 -46.5t261 -160.5t175 -250.5t65.5 -308.5zM994 800q0 -93 -65.5 -158.5t-158.5 -65.5t-158.5 65.5t-65.5 158.5t65.5 158.5t158.5 65.5t158.5 -65.5t65.5 -158.5zM1282 768q0 -122 -53.5 -228.5t-146.5 -177.5q-8 -6 -16 -2t-10 14q-6 52 -29 92q-7 10 3 20
+q58 54 91 127t33 155q0 111 -58.5 204t-157.5 141.5t-212 36.5q-133 -15 -229 -113t-109 -231q-10 -92 23.5 -176t98.5 -144q10 -10 3 -20q-24 -41 -29 -93q-2 -9 -10 -13t-16 2q-95 74 -148.5 183t-51.5 234q3 131 69 244t177 181.5t241 74.5q144 7 268 -60t196.5 -187.5
+t72.5 -263.5z" />
+    <glyph glyph-name="uniF2D0" unicode="&#xf2d0;" horiz-adv-x="1792" 
+d="M256 128h1280v768h-1280v-768zM1792 1248v-1216q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1472q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2D1" unicode="&#xf2d1;" horiz-adv-x="1792" 
+d="M1792 224v-192q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v192q0 66 47 113t113 47h1472q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2D2" unicode="&#xf2d2;" horiz-adv-x="2048" 
+d="M256 0h768v512h-768v-512zM1280 512h512v768h-768v-256h96q66 0 113 -47t47 -113v-352zM2048 1376v-960q0 -66 -47 -113t-113 -47h-608v-352q0 -66 -47 -113t-113 -47h-960q-66 0 -113 47t-47 113v960q0 66 47 113t113 47h608v352q0 66 47 113t113 47h960q66 0 113 -47
+t47 -113z" />
+    <glyph glyph-name="uniF2D3" unicode="&#xf2d3;" horiz-adv-x="1792" 
+d="M1175 215l146 146q10 10 10 23t-10 23l-233 233l233 233q10 10 10 23t-10 23l-146 146q-10 10 -23 10t-23 -10l-233 -233l-233 233q-10 10 -23 10t-23 -10l-146 -146q-10 -10 -10 -23t10 -23l233 -233l-233 -233q-10 -10 -10 -23t10 -23l146 -146q10 -10 23 -10t23 10
+l233 233l233 -233q10 -10 23 -10t23 10zM1792 1248v-1216q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1472q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2D4" unicode="&#xf2d4;" horiz-adv-x="1792" 
+d="M1257 425l-146 -146q-10 -10 -23 -10t-23 10l-169 169l-169 -169q-10 -10 -23 -10t-23 10l-146 146q-10 10 -10 23t10 23l169 169l-169 169q-10 10 -10 23t10 23l146 146q10 10 23 10t23 -10l169 -169l169 169q10 10 23 10t23 -10l146 -146q10 -10 10 -23t-10 -23
+l-169 -169l169 -169q10 -10 10 -23t-10 -23zM256 128h1280v1024h-1280v-1024zM1792 1248v-1216q0 -66 -47 -113t-113 -47h-1472q-66 0 -113 47t-47 113v1216q0 66 47 113t113 47h1472q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2D5" unicode="&#xf2d5;" horiz-adv-x="1792" 
+d="M1070 358l306 564h-654l-306 -564h654zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="uniF2D6" unicode="&#xf2d6;" horiz-adv-x="1794" 
+d="M1291 1060q-15 17 -35 8.5t-26 -28.5t5 -38q14 -17 40 -14.5t34 20.5t-18 52zM895 814q-8 -8 -19.5 -8t-18.5 8q-8 8 -8 19t8 18q7 8 18.5 8t19.5 -8q7 -7 7 -18t-7 -19zM1060 740l-35 -35q-12 -13 -29.5 -13t-30.5 13l-38 38q-12 13 -12 30t12 30l35 35q12 12 29.5 12
+t30.5 -12l38 -39q12 -12 12 -29.5t-12 -29.5zM951 870q-7 -8 -18.5 -8t-19.5 8q-7 8 -7 19t7 19q8 8 19 8t19 -8t8 -19t-8 -19zM1354 968q-34 -64 -107.5 -85.5t-127.5 16.5q-38 28 -61 66.5t-21 87.5t39 92t75.5 53t70.5 -5t70 -51q2 -2 13 -12.5t14.5 -13.5t13 -13.5
+t12.5 -15.5t10 -15.5t8.5 -18t4 -18.5t1 -21t-5 -22t-9.5 -24zM1555 486q3 20 -8.5 34.5t-27.5 21.5t-33 17t-23 20q-40 71 -84 98.5t-113 11.5q19 13 40 18.5t33 4.5l12 -1q2 45 -34 90q6 20 6.5 40.5t-2.5 30.5l-3 10q43 24 71 65t34 91q10 84 -43 150.5t-137 76.5
+q-60 7 -114 -18.5t-82 -74.5q-30 -51 -33.5 -101t14.5 -87t43.5 -64t56.5 -42q-45 4 -88 36t-57 88q-28 108 32 222q-16 21 -29 32q-50 0 -89 -19q19 24 42 37t36 14l13 1q0 50 -13 78q-10 21 -32.5 28.5t-47 -3.5t-37.5 -40q2 4 4 7q-7 -28 -6.5 -75.5t19 -117t48.5 -122.5
+q-25 -14 -47 -36q-35 -16 -85.5 -70.5t-84.5 -101.5l-33 -46q-90 -34 -181 -125.5t-75 -162.5q1 -16 11 -27q-15 -12 -30 -30q-21 -25 -21 -54t21.5 -40t63.5 6q41 19 77 49.5t55 60.5q-2 2 -6.5 5t-20.5 7.5t-33 3.5q23 5 51 12.5t40 10t27.5 6t26 4t23.5 0.5q14 -7 22 34
+q7 37 7 90q0 102 -40 150q106 -103 101 -219q-1 -29 -15 -50t-27 -27l-13 -6q-4 -7 -19 -32t-26 -45.5t-26.5 -52t-25 -61t-17 -63t-6.5 -66.5t10 -63q-35 54 -37 80q-22 -24 -34.5 -39t-33.5 -42t-30.5 -46t-16.5 -41t-0.5 -38t25.5 -27q45 -25 144 64t190.5 221.5
+t122.5 228.5q86 52 145 115.5t86 119.5q47 -93 154 -178q104 -83 167 -80q39 2 46 43zM1794 640q0 -182 -71 -348t-191 -286t-286.5 -191t-348.5 -71t-348.5 71t-286.5 191t-191 286t-71 348t71 348t191 286t286.5 191t348.5 71t348.5 -71t286.5 -191t191 -286t71 -348z" />
+    <glyph glyph-name="uniF2D7" unicode="&#xf2d7;" 
+d="M518 1353v-655q103 -1 191.5 1.5t125.5 5.5l37 3q68 2 90.5 24.5t39.5 94.5l33 142h103l-14 -322l7 -319h-103l-29 127q-15 68 -45 93t-84 26q-87 8 -352 8v-556q0 -78 43.5 -115.5t133.5 -37.5h357q35 0 59.5 2t55 7.5t54 18t48.5 32t46 50.5t39 73l93 216h89
+q-6 -37 -31.5 -252t-30.5 -276q-146 5 -263.5 8t-162.5 4h-44h-628l-376 -12v102l127 25q67 13 91.5 37t25.5 79l8 643q3 402 -8 645q-2 61 -25.5 84t-91.5 36l-127 24v102l376 -12h702q139 0 374 27q-6 -68 -14 -194.5t-12 -219.5l-5 -92h-93l-32 124q-31 121 -74 179.5
+t-113 58.5h-548q-28 0 -35.5 -8.5t-7.5 -30.5z" />
+    <glyph glyph-name="uniF2D8" unicode="&#xf2d8;" 
+d="M922 739v-182q0 -4 0.5 -15t0 -15l-1.5 -12t-3.5 -11.5t-6.5 -7.5t-11 -5.5t-16 -1.5v309q9 0 16 -1t11 -5t6.5 -5.5t3.5 -9.5t1 -10.5v-13.5v-14zM1238 643v-121q0 -1 0.5 -12.5t0 -15.5t-2.5 -11.5t-7.5 -10.5t-13.5 -3q-9 0 -14 9q-4 10 -4 165v7v8.5v9t1.5 8.5l3.5 7
+t5 5.5t8 1.5q6 0 10 -1.5t6.5 -4.5t4 -6t2 -8.5t0.5 -8v-9.5v-9zM180 407h122v472h-122v-472zM614 407h106v472h-159l-28 -221q-20 148 -32 221h-158v-472h107v312l45 -312h76l43 319v-319zM1039 712q0 67 -5 90q-3 16 -11 28.5t-17 20.5t-25 14t-26.5 8.5t-31 4t-29 1.5
+h-29.5h-12h-91v-472h56q169 -1 197 24.5t25 180.5q-1 62 -1 100zM1356 515v133q0 29 -2 45t-9.5 33.5t-24.5 25t-46 7.5q-46 0 -77 -34v154h-117v-472h110l7 30q30 -36 77 -36q50 0 66 30.5t16 83.5zM1536 1248v-1216q0 -66 -47 -113t-113 -47h-1216q-66 0 -113 47t-47 113
+v1216q0 66 47 113t113 47h1216q66 0 113 -47t47 -113z" />
+    <glyph glyph-name="uniF2D9" unicode="&#xf2d9;" horiz-adv-x="2176" 
+d="M1143 -197q-6 1 -11 4q-13 8 -36 23t-86 65t-116.5 104.5t-112 140t-89.5 172.5q-17 3 -175 37q66 -213 235 -362t391 -184zM502 409l168 -28q-25 76 -41 167.5t-19 145.5l-4 53q-84 -82 -121 -224q5 -65 17 -114zM612 1018q-43 -64 -77 -148q44 46 74 68zM2049 584
+q0 161 -62 307t-167.5 252t-250.5 168.5t-304 62.5q-147 0 -281 -52.5t-240 -148.5q-30 -58 -45 -160q60 51 143 83.5t158.5 43t143 13.5t108.5 -1l40 -3q33 -1 53 -15.5t24.5 -33t6.5 -37t-1 -28.5q-126 11 -227.5 0.5t-183 -43.5t-142.5 -71.5t-131 -98.5
+q4 -36 11.5 -92.5t35.5 -178t62 -179.5q123 -6 247.5 14.5t214.5 53.5t162.5 67t109.5 59l37 24q22 16 39.5 20.5t30.5 -5t17 -34.5q14 -97 -39 -121q-208 -97 -467 -134q-135 -20 -317 -16q41 -96 110 -176.5t137 -127t130.5 -79t101.5 -43.5l39 -12q143 -23 263 15
+q195 99 314 289t119 418zM2123 621q-14 -135 -40 -212q-70 -208 -181.5 -346.5t-318.5 -253.5q-48 -33 -82 -44q-72 -26 -163 -16q-36 -3 -73 -3q-283 0 -504.5 173t-295.5 442q-1 0 -4 0.5t-5 0.5q-6 -50 2.5 -112.5t26 -115t36 -98t31.5 -71.5l14 -26q8 -12 54 -82
+q-71 38 -124.5 106.5t-78.5 140t-39.5 137t-17.5 107.5l-2 42q-5 2 -33.5 12.5t-48.5 18t-53 20.5t-57.5 25t-50 25.5t-42.5 27t-25 25.5q19 -10 50.5 -25.5t113 -45.5t145.5 -38l2 32q11 149 94 290q41 202 176 365q28 115 81 214q15 28 32 45t49 32q158 74 303.5 104
+t302 11t306.5 -97q220 -115 333 -336t87 -474z" />
+    <glyph glyph-name="uniF2DA" unicode="&#xf2da;" horiz-adv-x="1792" 
+d="M1341 752q29 44 -6.5 129.5t-121.5 142.5q-58 39 -125.5 53.5t-118 4.5t-68.5 -37q-12 -23 -4.5 -28t42.5 -10q23 -3 38.5 -5t44.5 -9.5t56 -17.5q36 -13 67.5 -31.5t53 -37t40 -38.5t30.5 -38t22 -34.5t16.5 -28.5t12 -18.5t10.5 -6t11 9.5zM1704 178
+q-52 -127 -148.5 -220t-214.5 -141.5t-253 -60.5t-266 13.5t-251 91t-210 161.5t-141.5 235.5t-46.5 303.5q1 41 8.5 84.5t12.5 64t24 80.5t23 73q-51 -208 1 -397t173 -318t291 -206t346 -83t349 74.5t289 244.5q20 27 18 14q0 -4 -4 -14zM1465 627q0 -104 -40.5 -199
+t-108.5 -164t-162 -109.5t-198 -40.5t-198 40.5t-162 109.5t-108.5 164t-40.5 199t40.5 199t108.5 164t162 109.5t198 40.5t198 -40.5t162 -109.5t108.5 -164t40.5 -199zM1752 915q-65 147 -180.5 251t-253 153.5t-292 53.5t-301 -36.5t-275.5 -129t-220 -211.5t-131 -297
+t-10 -373q-49 161 -51.5 311.5t35.5 272.5t109 227t165.5 180.5t207 126t232 71t242.5 9t236 -54t216 -124.5t178 -197q33 -50 62 -121t31 -112zM1690 573q12 244 -136.5 416t-396.5 240q-8 0 -10 5t24 8q125 -4 230 -50t173 -120t116 -168.5t58.5 -199t-1 -208
+t-61.5 -197.5t-122.5 -167t-185 -117.5t-248.5 -46.5q108 30 201.5 80t174 123t129.5 176.5t55 225.5z" />
+    <glyph glyph-name="uniF2DB" unicode="&#xf2db;" 
+d="M192 256v-128h-112q-16 0 -16 16v16h-48q-16 0 -16 16v32q0 16 16 16h48v16q0 16 16 16h112zM192 512v-128h-112q-16 0 -16 16v16h-48q-16 0 -16 16v32q0 16 16 16h48v16q0 16 16 16h112zM192 768v-128h-112q-16 0 -16 16v16h-48q-16 0 -16 16v32q0 16 16 16h48v16
+q0 16 16 16h112zM192 1024v-128h-112q-16 0 -16 16v16h-48q-16 0 -16 16v32q0 16 16 16h48v16q0 16 16 16h112zM192 1280v-128h-112q-16 0 -16 16v16h-48q-16 0 -16 16v32q0 16 16 16h48v16q0 16 16 16h112zM1280 1440v-1472q0 -40 -28 -68t-68 -28h-832q-40 0 -68 28
+t-28 68v1472q0 40 28 68t68 28h832q40 0 68 -28t28 -68zM1536 208v-32q0 -16 -16 -16h-48v-16q0 -16 -16 -16h-112v128h112q16 0 16 -16v-16h48q16 0 16 -16zM1536 464v-32q0 -16 -16 -16h-48v-16q0 -16 -16 -16h-112v128h112q16 0 16 -16v-16h48q16 0 16 -16zM1536 720v-32
+q0 -16 -16 -16h-48v-16q0 -16 -16 -16h-112v128h112q16 0 16 -16v-16h48q16 0 16 -16zM1536 976v-32q0 -16 -16 -16h-48v-16q0 -16 -16 -16h-112v128h112q16 0 16 -16v-16h48q16 0 16 -16zM1536 1232v-32q0 -16 -16 -16h-48v-16q0 -16 -16 -16h-112v128h112q16 0 16 -16v-16
+h48q16 0 16 -16z" />
+    <glyph glyph-name="uniF2DC" unicode="&#xf2dc;" horiz-adv-x="1664" 
+d="M1566 419l-167 -33l186 -107q23 -13 29.5 -38.5t-6.5 -48.5q-14 -23 -39 -29.5t-48 6.5l-186 106l55 -160q13 -38 -12 -63.5t-60.5 -20.5t-48.5 42l-102 300l-271 156v-313l208 -238q16 -18 17 -39t-11 -36.5t-28.5 -25t-37 -5.5t-36.5 22l-112 128v-214q0 -26 -19 -45
+t-45 -19t-45 19t-19 45v214l-112 -128q-16 -18 -36.5 -22t-37 5.5t-28.5 25t-11 36.5t17 39l208 238v313l-271 -156l-102 -300q-13 -37 -48.5 -42t-60.5 20.5t-12 63.5l55 160l-186 -106q-23 -13 -48 -6.5t-39 29.5q-13 23 -6.5 48.5t29.5 38.5l186 107l-167 33
+q-29 6 -42 29t-8.5 46.5t25.5 40t50 10.5l310 -62l271 157l-271 157l-310 -62q-4 -1 -13 -1q-27 0 -44 18t-19 40t11 43t40 26l167 33l-186 107q-23 13 -29.5 38.5t6.5 48.5t39 30t48 -7l186 -106l-55 160q-13 38 12 63.5t60.5 20.5t48.5 -42l102 -300l271 -156v313
+l-208 238q-16 18 -17 39t11 36.5t28.5 25t37 5.5t36.5 -22l112 -128v214q0 26 19 45t45 19t45 -19t19 -45v-214l112 128q16 18 36.5 22t37 -5.5t28.5 -25t11 -36.5t-17 -39l-208 -238v-313l271 156l102 300q13 37 48.5 42t60.5 -20.5t12 -63.5l-55 -160l186 106
+q23 13 48 6.5t39 -29.5q13 -23 6.5 -48.5t-29.5 -38.5l-186 -107l167 -33q27 -5 40 -26t11 -43t-19 -40t-44 -18q-9 0 -13 1l-310 62l-271 -157l271 -157l310 62q29 6 50 -10.5t25.5 -40t-8.5 -46.5t-42 -29z" />
+    <glyph glyph-name="uniF2DD" unicode="&#xf2dd;" horiz-adv-x="1792" 
+d="M1473 607q7 118 -33 226.5t-113 189t-177 131t-221 57.5q-116 7 -225.5 -32t-192 -110.5t-135 -175t-59.5 -220.5q-7 -118 33 -226.5t113 -189t177.5 -131t221.5 -57.5q155 -9 293 59t224 195.5t94 283.5zM1792 1536l-349 -348q120 -117 180.5 -272t50.5 -321
+q-11 -183 -102 -339t-241 -255.5t-332 -124.5l-999 -132l347 347q-120 116 -180.5 271.5t-50.5 321.5q11 184 102 340t241.5 255.5t332.5 124.5q167 22 500 66t500 66z" />
+    <glyph glyph-name="uniF2DE" unicode="&#xf2de;" horiz-adv-x="1792" 
+d="M948 508l163 -329h-51l-175 350l-171 -350h-49l179 374l-78 33l21 49l240 -102l-21 -50zM563 1100l304 -130l-130 -304l-304 130zM907 915l240 -103l-103 -239l-239 102zM1188 765l191 -81l-82 -190l-190 81zM1680 640q0 159 -62 304t-167.5 250.5t-250.5 167.5t-304 62
+t-304 -62t-250.5 -167.5t-167.5 -250.5t-62 -304t62 -304t167.5 -250.5t250.5 -167.5t304 -62t304 62t250.5 167.5t167.5 250.5t62 304zM1792 640q0 -182 -71 -348t-191 -286t-286 -191t-348 -71t-348 71t-286 191t-191 286t-71 348t71 348t191 286t286 191t348 71t348 -71
+t286 -191t191 -286t71 -348z" />
+    <glyph glyph-name="uniF2E0" unicode="&#xf2e0;" horiz-adv-x="1920" 
+d="M1334 302q-4 24 -27.5 34t-49.5 10.5t-48.5 12.5t-25.5 38q-5 47 33 139.5t75 181t32 127.5q-14 101 -117 103q-45 1 -75 -16l-3 -2l-5 -2.5t-4.5 -2t-5 -2t-5 -0.5t-6 1.5t-6 3.5t-6.5 5q-3 2 -9 8.5t-9 9t-8.5 7.5t-9.5 7.5t-9.5 5.5t-11 4.5t-11.5 2.5q-30 5 -48 -3
+t-45 -31q-1 -1 -9 -8.5t-12.5 -11t-15 -10t-16.5 -5.5t-17 3q-54 27 -84 40q-41 18 -94 -5t-76 -65q-16 -28 -41 -98.5t-43.5 -132.5t-40 -134t-21.5 -73q-22 -69 18.5 -119t110.5 -46q30 2 50.5 15t38.5 46q7 13 79 199.5t77 194.5q6 11 21.5 18t29.5 0q27 -15 21 -53
+q-2 -18 -51 -139.5t-50 -132.5q-6 -38 19.5 -56.5t60.5 -7t55 49.5q4 8 45.5 92t81.5 163.5t46 88.5q20 29 41 28q29 0 25 -38q-2 -16 -65.5 -147.5t-70.5 -159.5q-12 -53 13 -103t74 -74q17 -9 51 -15.5t71.5 -8t62.5 14t20 48.5zM383 86q3 -15 -5 -27.5t-23 -15.5
+q-14 -3 -26.5 5t-15.5 23q-3 14 5 27t22 16t27 -5t16 -23zM953 -177q12 -17 8.5 -37.5t-20.5 -32.5t-37.5 -8t-32.5 21q-11 17 -7.5 37.5t20.5 32.5t37.5 8t31.5 -21zM177 635q-18 -27 -49.5 -33t-57.5 13q-26 18 -32 50t12 58q18 27 49.5 33t57.5 -12q26 -19 32 -50.5
+t-12 -58.5zM1467 -42q19 -28 13 -61.5t-34 -52.5t-60.5 -13t-51.5 34t-13 61t33 53q28 19 60.5 13t52.5 -34zM1579 562q69 -113 42.5 -244.5t-134.5 -207.5q-90 -63 -199 -60q-20 -80 -84.5 -127t-143.5 -44.5t-140 57.5q-12 -9 -13 -10q-103 -71 -225 -48.5t-193 126.5
+q-50 73 -53 164q-83 14 -142.5 70.5t-80.5 128t-2 152t81 138.5q-36 60 -38 128t24.5 125t79.5 98.5t121 50.5q32 85 99 148t146.5 91.5t168 17t159.5 -66.5q72 21 140 17.5t128.5 -36t104.5 -80t67.5 -115t17.5 -140.5q52 -16 87 -57t45.5 -89t-5.5 -99.5t-58 -87.5z
+M455 1222q14 -20 9.5 -44.5t-24.5 -38.5q-19 -14 -43.5 -9.5t-37.5 24.5q-14 20 -9.5 44.5t24.5 38.5q19 14 43.5 9.5t37.5 -24.5zM614 1503q4 -16 -5 -30.5t-26 -18.5t-31 5.5t-18 26.5q-3 17 6.5 31t25.5 18q17 4 31 -5.5t17 -26.5zM1800 555q4 -20 -6.5 -37t-30.5 -21
+q-19 -4 -36 6.5t-21 30.5t6.5 37t30.5 22q20 4 36.5 -7.5t20.5 -30.5zM1136 1448q16 -27 8.5 -58.5t-35.5 -47.5q-27 -16 -57.5 -8.5t-46.5 34.5q-16 28 -8.5 59t34.5 48t58 9t47 -36zM1882 792q4 -15 -4 -27.5t-23 -16.5q-15 -3 -27.5 5.5t-15.5 22.5q-3 15 5 28t23 16
+q14 3 26.5 -5t15.5 -23zM1691 1033q15 -22 10.5 -49t-26.5 -43q-22 -15 -49 -10t-42 27t-10 49t27 43t48.5 11t41.5 -28z" />
+    <glyph glyph-name="uniF2E1" unicode="&#xf2e1;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E2" unicode="&#xf2e2;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E3" unicode="&#xf2e3;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E4" unicode="&#xf2e4;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E5" unicode="&#xf2e5;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E6" unicode="&#xf2e6;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E7" unicode="&#xf2e7;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="_698" unicode="&#xf2e8;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2E9" unicode="&#xf2e9;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2EA" unicode="&#xf2ea;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2EB" unicode="&#xf2eb;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2EC" unicode="&#xf2ec;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2ED" unicode="&#xf2ed;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="uniF2EE" unicode="&#xf2ee;" horiz-adv-x="1792" 
+ />
+    <glyph glyph-name="lessequal" unicode="&#xf500;" horiz-adv-x="1792" 
+ />
+  </font>
+</defs></svg>
diff --git a/css/fonts/fontawesome-webfont.ttf b/css/fonts/fontawesome-webfont.ttf
new file mode 100644
index 00000000..35acda2f
Binary files /dev/null and b/css/fonts/fontawesome-webfont.ttf differ
diff --git a/css/fonts/fontawesome-webfont.woff b/css/fonts/fontawesome-webfont.woff
new file mode 100644
index 00000000..400014a4
Binary files /dev/null and b/css/fonts/fontawesome-webfont.woff differ
diff --git a/css/fonts/fontawesome-webfont.woff2 b/css/fonts/fontawesome-webfont.woff2
new file mode 100644
index 00000000..4d13fc60
Binary files /dev/null and b/css/fonts/fontawesome-webfont.woff2 differ
diff --git a/css/fonts/lato-bold-italic.woff b/css/fonts/lato-bold-italic.woff
new file mode 100644
index 00000000..88ad05b9
Binary files /dev/null and b/css/fonts/lato-bold-italic.woff differ
diff --git a/css/fonts/lato-bold-italic.woff2 b/css/fonts/lato-bold-italic.woff2
new file mode 100644
index 00000000..c4e3d804
Binary files /dev/null and b/css/fonts/lato-bold-italic.woff2 differ
diff --git a/css/fonts/lato-bold.woff b/css/fonts/lato-bold.woff
new file mode 100644
index 00000000..c6dff51f
Binary files /dev/null and b/css/fonts/lato-bold.woff differ
diff --git a/css/fonts/lato-bold.woff2 b/css/fonts/lato-bold.woff2
new file mode 100644
index 00000000..bb195043
Binary files /dev/null and b/css/fonts/lato-bold.woff2 differ
diff --git a/css/fonts/lato-normal-italic.woff b/css/fonts/lato-normal-italic.woff
new file mode 100644
index 00000000..76114bc0
Binary files /dev/null and b/css/fonts/lato-normal-italic.woff differ
diff --git a/css/fonts/lato-normal-italic.woff2 b/css/fonts/lato-normal-italic.woff2
new file mode 100644
index 00000000..3404f37e
Binary files /dev/null and b/css/fonts/lato-normal-italic.woff2 differ
diff --git a/css/fonts/lato-normal.woff b/css/fonts/lato-normal.woff
new file mode 100644
index 00000000..ae1307ff
Binary files /dev/null and b/css/fonts/lato-normal.woff differ
diff --git a/css/fonts/lato-normal.woff2 b/css/fonts/lato-normal.woff2
new file mode 100644
index 00000000..3bf98433
Binary files /dev/null and b/css/fonts/lato-normal.woff2 differ
diff --git a/css/theme.css b/css/theme.css
new file mode 100644
index 00000000..ad773009
--- /dev/null
+++ b/css/theme.css
@@ -0,0 +1,13 @@
+/*
+ * This file is copied from the upstream ReadTheDocs Sphinx
+ * theme. To aid upgradability this file should *not* be edited.
+ * modifications we need should be included in theme_extra.css.
+ *
+ * https://github.com/readthedocs/sphinx_rtd_theme
+ */
+
+ /* sphinx_rtd_theme version 1.2.0 | MIT license */
+html{box-sizing:border-box}*,:after,:before{box-sizing:inherit}article,aside,details,figcaption,figure,footer,header,hgroup,nav,section{display:block}audio,canvas,video{display:inline-block;*display:inline;*zoom:1}[hidden],audio:not([controls]){display:none}*{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}html{font-size:100%;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%}body{margin:0}a:active,a:hover{outline:0}abbr[title]{border-bottom:1px dotted}b,strong{font-weight:700}blockquote{margin:0}dfn{font-style:italic}ins{background:#ff9;text-decoration:none}ins,mark{color:#000}mark{background:#ff0;font-style:italic;font-weight:700}.rst-content code,.rst-content tt,code,kbd,pre,samp{font-family:monospace,serif;_font-family:courier new,monospace;font-size:1em}pre{white-space:pre}q{quotes:none}q:after,q:before{content:"";content:none}small{font-size:85%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sup{top:-.5em}sub{bottom:-.25em}dl,ol,ul{margin:0;padding:0;list-style:none;list-style-image:none}li{list-style:none}dd{margin:0}img{border:0;-ms-interpolation-mode:bicubic;vertical-align:middle;max-width:100%}svg:not(:root){overflow:hidden}figure,form{margin:0}label{cursor:pointer}button,input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}button,input{line-height:normal}button,input[type=button],input[type=reset],input[type=submit]{cursor:pointer;-webkit-appearance:button;*overflow:visible}button[disabled],input[disabled]{cursor:default}input[type=search]{-webkit-appearance:textfield;-moz-box-sizing:content-box;-webkit-box-sizing:content-box;box-sizing:content-box}textarea{resize:vertical}table{border-collapse:collapse;border-spacing:0}td{vertical-align:top}.chromeframe{margin:.2em 0;background:#ccc;color:#000;padding:.2em 
0}.ir{display:block;border:0;text-indent:-999em;overflow:hidden;background-color:transparent;background-repeat:no-repeat;text-align:left;direction:ltr;*line-height:0}.ir br{display:none}.hidden{display:none!important;visibility:hidden}.visuallyhidden{border:0;clip:rect(0 0 0 0);height:1px;margin:-1px;overflow:hidden;padding:0;position:absolute;width:1px}.visuallyhidden.focusable:active,.visuallyhidden.focusable:focus{clip:auto;height:auto;margin:0;overflow:visible;position:static;width:auto}.invisible{visibility:hidden}.relative{position:relative}big,small{font-size:100%}@media print{body,html,section{background:none!important}*{box-shadow:none!important;text-shadow:none!important;filter:none!important;-ms-filter:none!important}a,a:visited{text-decoration:underline}.ir a:after,a[href^="#"]:after,a[href^="javascript:"]:after{content:""}blockquote,pre{page-break-inside:avoid}thead{display:table-header-group}img,tr{page-break-inside:avoid}img{max-width:100%!important}@page{margin:.5cm}.rst-content .toctree-wrapper>p.caption,h2,h3,p{orphans:3;widows:3}.rst-content .toctree-wrapper>p.caption,h2,h3{page-break-after:avoid}}.btn,.fa:before,.icon:before,.rst-content .admonition,.rst-content .admonition-title:before,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .code-block-caption .headerlink:before,.rst-content .danger,.rst-content .eqno .headerlink:before,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content 
tt.download span:first-child:before,.wy-alert,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before,input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week],select,textarea{-webkit-font-smoothing:antialiased}.clearfix{*zoom:1}.clearfix:after,.clearfix:before{display:table;content:""}.clearfix:after{clear:both}/*!
+ *  Font Awesome 4.7.0 by @davegandy - http://fontawesome.io - @fontawesome
+ *  License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License)
+ */@font-face{font-family:FontAwesome;src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713);src:url(fonts/fontawesome-webfont.eot?674f50d287a8c48dc19ba404d20fe713?#iefix&v=4.7.0) format("embedded-opentype"),url(fonts/fontawesome-webfont.woff2?af7ae505a9eed503f8b8e6982036873e) format("woff2"),url(fonts/fontawesome-webfont.woff?fee66e712a8a08eef5805a46892932ad) format("woff"),url(fonts/fontawesome-webfont.ttf?b06871f281fee6b241d60582ae9369b9) format("truetype"),url(fonts/fontawesome-webfont.svg?912ec66d7572ff821749319396470bde#fontawesomeregular) format("svg");font-weight:400;font-style:normal}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{display:inline-block;font:normal normal normal 14px/1 FontAwesome;font-size:inherit;text-rendering:auto;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale}.fa-lg{font-size:1.33333em;line-height:.75em;vertical-align:-15%}.fa-2x{font-size:2em}.fa-3x{font-size:3em}.fa-4x{font-size:4em}.fa-5x{font-size:5em}.fa-fw{width:1.28571em;text-align:center}.fa-ul{padding-left:0;margin-left:2.14286em;list-style-type:none}.fa-ul>li{position:relative}.fa-li{position:absolute;left:-2.14286em;width:2.14286em;top:.14286em;text-align:center}.fa-li.fa-lg{left:-1.85714em}.fa-border{padding:.2em .25em .15em;border:.08em solid 
#eee;border-radius:.1em}.fa-pull-left{float:left}.fa-pull-right{float:right}.fa-pull-left.icon,.fa.fa-pull-left,.rst-content .code-block-caption .fa-pull-left.headerlink,.rst-content .eqno .fa-pull-left.headerlink,.rst-content .fa-pull-left.admonition-title,.rst-content code.download span.fa-pull-left:first-child,.rst-content dl dt .fa-pull-left.headerlink,.rst-content h1 .fa-pull-left.headerlink,.rst-content h2 .fa-pull-left.headerlink,.rst-content h3 .fa-pull-left.headerlink,.rst-content h4 .fa-pull-left.headerlink,.rst-content h5 .fa-pull-left.headerlink,.rst-content h6 .fa-pull-left.headerlink,.rst-content p .fa-pull-left.headerlink,.rst-content table>caption .fa-pull-left.headerlink,.rst-content tt.download span.fa-pull-left:first-child,.wy-menu-vertical li.current>a button.fa-pull-left.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-left.toctree-expand,.wy-menu-vertical li button.fa-pull-left.toctree-expand{margin-right:.3em}.fa-pull-right.icon,.fa.fa-pull-right,.rst-content .code-block-caption .fa-pull-right.headerlink,.rst-content .eqno .fa-pull-right.headerlink,.rst-content .fa-pull-right.admonition-title,.rst-content code.download span.fa-pull-right:first-child,.rst-content dl dt .fa-pull-right.headerlink,.rst-content h1 .fa-pull-right.headerlink,.rst-content h2 .fa-pull-right.headerlink,.rst-content h3 .fa-pull-right.headerlink,.rst-content h4 .fa-pull-right.headerlink,.rst-content h5 .fa-pull-right.headerlink,.rst-content h6 .fa-pull-right.headerlink,.rst-content p .fa-pull-right.headerlink,.rst-content table>caption .fa-pull-right.headerlink,.rst-content tt.download span.fa-pull-right:first-child,.wy-menu-vertical li.current>a button.fa-pull-right.toctree-expand,.wy-menu-vertical li.on a button.fa-pull-right.toctree-expand,.wy-menu-vertical li button.fa-pull-right.toctree-expand{margin-left:.3em}.pull-right{float:right}.pull-left{float:left}.fa.pull-left,.pull-left.icon,.rst-content .code-block-caption .pull-left.headerlink,.rst-content .eqno 
.pull-left.headerlink,.rst-content .pull-left.admonition-title,.rst-content code.download span.pull-left:first-child,.rst-content dl dt .pull-left.headerlink,.rst-content h1 .pull-left.headerlink,.rst-content h2 .pull-left.headerlink,.rst-content h3 .pull-left.headerlink,.rst-content h4 .pull-left.headerlink,.rst-content h5 .pull-left.headerlink,.rst-content h6 .pull-left.headerlink,.rst-content p .pull-left.headerlink,.rst-content table>caption .pull-left.headerlink,.rst-content tt.download span.pull-left:first-child,.wy-menu-vertical li.current>a button.pull-left.toctree-expand,.wy-menu-vertical li.on a button.pull-left.toctree-expand,.wy-menu-vertical li button.pull-left.toctree-expand{margin-right:.3em}.fa.pull-right,.pull-right.icon,.rst-content .code-block-caption .pull-right.headerlink,.rst-content .eqno .pull-right.headerlink,.rst-content .pull-right.admonition-title,.rst-content code.download span.pull-right:first-child,.rst-content dl dt .pull-right.headerlink,.rst-content h1 .pull-right.headerlink,.rst-content h2 .pull-right.headerlink,.rst-content h3 .pull-right.headerlink,.rst-content h4 .pull-right.headerlink,.rst-content h5 .pull-right.headerlink,.rst-content h6 .pull-right.headerlink,.rst-content p .pull-right.headerlink,.rst-content table>caption .pull-right.headerlink,.rst-content tt.download span.pull-right:first-child,.wy-menu-vertical li.current>a button.pull-right.toctree-expand,.wy-menu-vertical li.on a button.pull-right.toctree-expand,.wy-menu-vertical li button.pull-right.toctree-expand{margin-left:.3em}.fa-spin{-webkit-animation:fa-spin 2s linear infinite;animation:fa-spin 2s linear infinite}.fa-pulse{-webkit-animation:fa-spin 1s steps(8) infinite;animation:fa-spin 1s steps(8) infinite}@-webkit-keyframes fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}@keyframes 
fa-spin{0%{-webkit-transform:rotate(0deg);transform:rotate(0deg)}to{-webkit-transform:rotate(359deg);transform:rotate(359deg)}}.fa-rotate-90{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=1)";-webkit-transform:rotate(90deg);-ms-transform:rotate(90deg);transform:rotate(90deg)}.fa-rotate-180{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2)";-webkit-transform:rotate(180deg);-ms-transform:rotate(180deg);transform:rotate(180deg)}.fa-rotate-270{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=3)";-webkit-transform:rotate(270deg);-ms-transform:rotate(270deg);transform:rotate(270deg)}.fa-flip-horizontal{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)";-webkit-transform:scaleX(-1);-ms-transform:scaleX(-1);transform:scaleX(-1)}.fa-flip-vertical{-ms-filter:"progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)";-webkit-transform:scaleY(-1);-ms-transform:scaleY(-1);transform:scaleY(-1)}:root .fa-flip-horizontal,:root .fa-flip-vertical,:root .fa-rotate-90,:root .fa-rotate-180,:root 
.fa-rotate-270{filter:none}.fa-stack{position:relative;display:inline-block;width:2em;height:2em;line-height:2em;vertical-align:middle}.fa-stack-1x,.fa-stack-2x{position:absolute;left:0;width:100%;text-align:center}.fa-stack-1x{line-height:inherit}.fa-stack-2x{font-size:2em}.fa-inverse{color:#fff}.fa-glass:before{content:""}.fa-music:before{content:""}.fa-search:before,.icon-search:before{content:""}.fa-envelope-o:before{content:""}.fa-heart:before{content:""}.fa-star:before{content:""}.fa-star-o:before{content:""}.fa-user:before{content:""}.fa-film:before{content:""}.fa-th-large:before{content:""}.fa-th:before{content:""}.fa-th-list:before{content:""}.fa-check:before{content:""}.fa-close:before,.fa-remove:before,.fa-times:before{content:""}.fa-search-plus:before{content:""}.fa-search-minus:before{content:""}.fa-power-off:before{content:""}.fa-signal:before{content:""}.fa-cog:before,.fa-gear:before{content:""}.fa-trash-o:before{content:""}.fa-home:before,.icon-home:before{content:""}.fa-file-o:before{content:""}.fa-clock-o:before{content:""}.fa-road:before{content:""}.fa-download:before,.rst-content code.download span:first-child:before,.rst-content tt.download 
span:first-child:before{content:""}.fa-arrow-circle-o-down:before{content:""}.fa-arrow-circle-o-up:before{content:""}.fa-inbox:before{content:""}.fa-play-circle-o:before{content:""}.fa-repeat:before,.fa-rotate-right:before{content:""}.fa-refresh:before{content:""}.fa-list-alt:before{content:""}.fa-lock:before{content:""}.fa-flag:before{content:""}.fa-headphones:before{content:""}.fa-volume-off:before{content:""}.fa-volume-down:before{content:""}.fa-volume-up:before{content:""}.fa-qrcode:before{content:""}.fa-barcode:before{content:""}.fa-tag:before{content:""}.fa-tags:before{content:""}.fa-book:before,.icon-book:before{content:""}.fa-bookmark:before{content:""}.fa-print:before{content:""}.fa-camera:before{content:""}.fa-font:before{content:""}.fa-bold:before{content:""}.fa-italic:before{content:""}.fa-text-height:before{content:""}.fa-text-width:before{content:""}.fa-align-left:before{content:""}.fa-align-center:before{content:""}.fa-align-right:before{content:""}.fa-align-justify:before{content:""}.fa-list:before{content:""}.fa-dedent:before,.fa-outdent:before{content:""}.fa-indent:before{content:""}.fa-video-camera:before{content:""}.fa-image:before,.fa-photo:before,.fa-picture-o:before{content:""}.fa-pencil:before{content:""}.fa-map-marker:before{content:""}.fa-adjust:before{content:""}.fa-tint:before{content:""}.fa-edit:before,.fa-pencil-square-o:before{content:""}.fa-share-square-o:before{content:""}.fa-check-square-o:before{content:""}.fa-arrows:before{content:""}.fa-step-backward:before{content:""}.fa-fast-backward:before{content:""}.fa-backward:before{content:""}.fa-play:before{content:""}.fa-pause:before{content:""}.fa-stop:before{content:""}.fa-forward:before{content:""}.fa-fast-forward:before{content:""}.fa-step-forward:before{content:""}.fa-eject:before{content:""}.fa-chevron-left:before{content:""}.fa-chevron-right:before{content:""}.fa-plus-circle:before{content:""}.fa-minus-circle:before{content
:""}.fa-times-circle:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before{content:""}.fa-check-circle:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before{content:""}.fa-question-circle:before{content:""}.fa-info-circle:before{content:""}.fa-crosshairs:before{content:""}.fa-times-circle-o:before{content:""}.fa-check-circle-o:before{content:""}.fa-ban:before{content:""}.fa-arrow-left:before{content:""}.fa-arrow-right:before{content:""}.fa-arrow-up:before{content:""}.fa-arrow-down:before{content:""}.fa-mail-forward:before,.fa-share:before{content:""}.fa-expand:before{content:""}.fa-compress:before{content:""}.fa-plus:before{content:""}.fa-minus:before{content:""}.fa-asterisk:before{content:""}.fa-exclamation-circle:before,.rst-content .admonition-title:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before{content:""}.fa-gift:before{content:""}.fa-leaf:before{content:""}.fa-fire:before,.icon-fire:before{content:""}.fa-eye:before{content:""}.fa-eye-slash:before{content:""}.fa-exclamation-triangle:before,.fa-warning:before{content:""}.fa-plane:before{content:""}.fa-calendar:before{content:""}.fa-random:before{content:""}.fa-comment:before{content:""}.fa-magnet:before{content:""}.fa-chevron-up:before{content:""}.fa-chevron-down:before{content:""}.fa-retweet:before{content:""}.fa-shopping-cart:before{content:""}.fa-folder:before{content:""}.fa-folder-open:before{content:""}.fa-arrows-v:before{content:""}.fa-arrows-h:before{content:""}.fa-bar-chart-o:before,.fa-bar-chart:before{content:""}.fa-twitter-square:before{content:""}.fa-facebook-square:before{content:""}.fa-camera-retro:before{content:""}.fa-key:before{content:""}.fa-cogs:before,.fa-gears:before{content:""}.fa-comments:before{content:""}.fa-thumbs-o-up:before{content:""}.fa-thumbs-o-down:before{content:""}.fa-star-half:before
{content:""}.fa-heart-o:before{content:""}.fa-sign-out:before{content:""}.fa-linkedin-square:before{content:""}.fa-thumb-tack:before{content:""}.fa-external-link:before{content:""}.fa-sign-in:before{content:""}.fa-trophy:before{content:""}.fa-github-square:before{content:""}.fa-upload:before{content:""}.fa-lemon-o:before{content:""}.fa-phone:before{content:""}.fa-square-o:before{content:""}.fa-bookmark-o:before{content:""}.fa-phone-square:before{content:""}.fa-twitter:before{content:""}.fa-facebook-f:before,.fa-facebook:before{content:""}.fa-github:before,.icon-github:before{content:""}.fa-unlock:before{content:""}.fa-credit-card:before{content:""}.fa-feed:before,.fa-rss:before{content:""}.fa-hdd-o:before{content:""}.fa-bullhorn:before{content:""}.fa-bell:before{content:""}.fa-certificate:before{content:""}.fa-hand-o-right:before{content:""}.fa-hand-o-left:before{content:""}.fa-hand-o-up:before{content:""}.fa-hand-o-down:before{content:""}.fa-arrow-circle-left:before,.icon-circle-arrow-left:before{content:""}.fa-arrow-circle-right:before,.icon-circle-arrow-right:before{content:""}.fa-arrow-circle-up:before{content:""}.fa-arrow-circle-down:before{content:""}.fa-globe:before{content:""}.fa-wrench:before{content:""}.fa-tasks:before{content:""}.fa-filter:before{content:""}.fa-briefcase:before{content:""}.fa-arrows-alt:before{content:""}.fa-group:before,.fa-users:before{content:""}.fa-chain:before,.fa-link:before,.icon-link:before{content:""}.fa-cloud:before{content:""}.fa-flask:before{content:""}.fa-cut:before,.fa-scissors:before{content:""}.fa-copy:before,.fa-files-o:before{content:""}.fa-paperclip:before{content:""}.fa-floppy-o:before,.fa-save:before{content:""}.fa-square:before{content:""}.fa-bars:before,.fa-navicon:before,.fa-reorder:before{content:""}.fa-list-ul:before{content:""}.fa-list-ol:before{content:""}.fa-strikethrough:before{content:""}.fa-underline:before{content:""}.fa-table:before{content:""}.fa-magi
c:before{content:""}.fa-truck:before{content:""}.fa-pinterest:before{content:""}.fa-pinterest-square:before{content:""}.fa-google-plus-square:before{content:""}.fa-google-plus:before{content:""}.fa-money:before{content:""}.fa-caret-down:before,.icon-caret-down:before,.wy-dropdown .caret:before{content:""}.fa-caret-up:before{content:""}.fa-caret-left:before{content:""}.fa-caret-right:before{content:""}.fa-columns:before{content:""}.fa-sort:before,.fa-unsorted:before{content:""}.fa-sort-desc:before,.fa-sort-down:before{content:""}.fa-sort-asc:before,.fa-sort-up:before{content:""}.fa-envelope:before{content:""}.fa-linkedin:before{content:""}.fa-rotate-left:before,.fa-undo:before{content:""}.fa-gavel:before,.fa-legal:before{content:""}.fa-dashboard:before,.fa-tachometer:before{content:""}.fa-comment-o:before{content:""}.fa-comments-o:before{content:""}.fa-bolt:before,.fa-flash:before{content:""}.fa-sitemap:before{content:""}.fa-umbrella:before{content:""}.fa-clipboard:before,.fa-paste:before{content:""}.fa-lightbulb-o:before{content:""}.fa-exchange:before{content:""}.fa-cloud-download:before{content:""}.fa-cloud-upload:before{content:""}.fa-user-md:before{content:""}.fa-stethoscope:before{content:""}.fa-suitcase:before{content:""}.fa-bell-o:before{content:""}.fa-coffee:before{content:""}.fa-cutlery:before{content:""}.fa-file-text-o:before{content:""}.fa-building-o:before{content:""}.fa-hospital-o:before{content:""}.fa-ambulance:before{content:""}.fa-medkit:before{content:""}.fa-fighter-jet:before{content:""}.fa-beer:before{content:""}.fa-h-square:before{content:""}.fa-plus-square:before{content:""}.fa-angle-double-left:before{content:""}.fa-angle-double-right:before{content:""}.fa-angle-double-up:before{content:""}.fa-angle-double-down:before{content:""}.fa-angle-left:before{content:""}.fa-angle-right:before{content:""}.fa-angle-up:before{content:""}.fa-angle-down:before{content:""}.fa-desktop:before{content:""}.fa-l
aptop:before{content:""}.fa-tablet:before{content:""}.fa-mobile-phone:before,.fa-mobile:before{content:""}.fa-circle-o:before{content:""}.fa-quote-left:before{content:""}.fa-quote-right:before{content:""}.fa-spinner:before{content:""}.fa-circle:before{content:""}.fa-mail-reply:before,.fa-reply:before{content:""}.fa-github-alt:before{content:""}.fa-folder-o:before{content:""}.fa-folder-open-o:before{content:""}.fa-smile-o:before{content:""}.fa-frown-o:before{content:""}.fa-meh-o:before{content:""}.fa-gamepad:before{content:""}.fa-keyboard-o:before{content:""}.fa-flag-o:before{content:""}.fa-flag-checkered:before{content:""}.fa-terminal:before{content:""}.fa-code:before{content:""}.fa-mail-reply-all:before,.fa-reply-all:before{content:""}.fa-star-half-empty:before,.fa-star-half-full:before,.fa-star-half-o:before{content:""}.fa-location-arrow:before{content:""}.fa-crop:before{content:""}.fa-code-fork:before{content:""}.fa-chain-broken:before,.fa-unlink:before{content:""}.fa-question:before{content:""}.fa-info:before{content:""}.fa-exclamation:before{content:""}.fa-superscript:before{content:""}.fa-subscript:before{content:""}.fa-eraser:before{content:""}.fa-puzzle-piece:before{content:""}.fa-microphone:before{content:""}.fa-microphone-slash:before{content:""}.fa-shield:before{content:""}.fa-calendar-o:before{content:""}.fa-fire-extinguisher:before{content:""}.fa-rocket:before{content:""}.fa-maxcdn:before{content:""}.fa-chevron-circle-left:before{content:""}.fa-chevron-circle-right:before{content:""}.fa-chevron-circle-up:before{content:""}.fa-chevron-circle-down:before{content:""}.fa-html5:before{content:""}.fa-css3:before{content:""}.fa-anchor:before{content:""}.fa-unlock-alt:before{content:""}.fa-bullseye:before{content:""}.fa-ellipsis-h:before{content:""}.fa-ellipsis-v:before{content:""}.fa-rss-square:before{content:""}.fa-play-circle:before{content:""}.fa-ticket:before{content:""}.fa-minus-square:before{content:
""}.fa-minus-square-o:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before{content:""}.fa-level-up:before{content:""}.fa-level-down:before{content:""}.fa-check-square:before{content:""}.fa-pencil-square:before{content:""}.fa-external-link-square:before{content:""}.fa-share-square:before{content:""}.fa-compass:before{content:""}.fa-caret-square-o-down:before,.fa-toggle-down:before{content:""}.fa-caret-square-o-up:before,.fa-toggle-up:before{content:""}.fa-caret-square-o-right:before,.fa-toggle-right:before{content:""}.fa-eur:before,.fa-euro:before{content:""}.fa-gbp:before{content:""}.fa-dollar:before,.fa-usd:before{content:""}.fa-inr:before,.fa-rupee:before{content:""}.fa-cny:before,.fa-jpy:before,.fa-rmb:before,.fa-yen:before{content:""}.fa-rouble:before,.fa-rub:before,.fa-ruble:before{content:""}.fa-krw:before,.fa-won:before{content:""}.fa-bitcoin:before,.fa-btc:before{content:""}.fa-file:before{content:""}.fa-file-text:before{content:""}.fa-sort-alpha-asc:before{content:""}.fa-sort-alpha-desc:before{content:""}.fa-sort-amount-asc:before{content:""}.fa-sort-amount-desc:before{content:""}.fa-sort-numeric-asc:before{content:""}.fa-sort-numeric-desc:before{content:""}.fa-thumbs-up:before{content:""}.fa-thumbs-down:before{content:""}.fa-youtube-square:before{content:""}.fa-youtube:before{content:""}.fa-xing:before{content:""}.fa-xing-square:before{content:""}.fa-youtube-play:before{content:""}.fa-dropbox:before{content:""}.fa-stack-overflow:before{content:""}.fa-instagram:before{content:""}.fa-flickr:before{content:""}.fa-adn:before{content:""}.fa-bitbucket:before,.icon-bitbucket:before{content:""}.fa-bitbucket-square:before{content:""}.fa-tumblr:before{content:""}.fa-tumblr-square:before{content:""}.fa-long-arrow-down:before{content:""}.fa-long-arrow-up:before{content:""}.fa-long-arrow-left:before{content:""}.fa-long-arrow-right:before{content:""}.fa-a
pple:before{content:""}.fa-windows:before{content:""}.fa-android:before{content:""}.fa-linux:before{content:""}.fa-dribbble:before{content:""}.fa-skype:before{content:""}.fa-foursquare:before{content:""}.fa-trello:before{content:""}.fa-female:before{content:""}.fa-male:before{content:""}.fa-gittip:before,.fa-gratipay:before{content:""}.fa-sun-o:before{content:""}.fa-moon-o:before{content:""}.fa-archive:before{content:""}.fa-bug:before{content:""}.fa-vk:before{content:""}.fa-weibo:before{content:""}.fa-renren:before{content:""}.fa-pagelines:before{content:""}.fa-stack-exchange:before{content:""}.fa-arrow-circle-o-right:before{content:""}.fa-arrow-circle-o-left:before{content:""}.fa-caret-square-o-left:before,.fa-toggle-left:before{content:""}.fa-dot-circle-o:before{content:""}.fa-wheelchair:before{content:""}.fa-vimeo-square:before{content:""}.fa-try:before,.fa-turkish-lira:before{content:""}.fa-plus-square-o:before,.wy-menu-vertical li button.toctree-expand:before{content:""}.fa-space-shuttle:before{content:""}.fa-slack:before{content:""}.fa-envelope-square:before{content:""}.fa-wordpress:before{content:""}.fa-openid:before{content:""}.fa-bank:before,.fa-institution:before,.fa-university:before{content:""}.fa-graduation-cap:before,.fa-mortar-board:before{content:""}.fa-yahoo:before{content:""}.fa-google:before{content:""}.fa-reddit:before{content:""}.fa-reddit-square:before{content:""}.fa-stumbleupon-circle:before{content:""}.fa-stumbleupon:before{content:""}.fa-delicious:before{content:""}.fa-digg:before{content:""}.fa-pied-piper-pp:before{content:""}.fa-pied-piper-alt:before{content:""}.fa-drupal:before{content:""}.fa-joomla:before{content:""}.fa-language:before{content:""}.fa-fax:before{content:""}.fa-building:before{content:""}.fa-child:before{content:""}.fa-paw:before{content:""}.fa-spoon:before{content:""}.fa-cube:before{content:""}.fa-cubes:before{content:""}.fa-behance:before{content:""}.fa-behance-squa
re:before{content:""}.fa-steam:before{content:""}.fa-steam-square:before{content:""}.fa-recycle:before{content:""}.fa-automobile:before,.fa-car:before{content:""}.fa-cab:before,.fa-taxi:before{content:""}.fa-tree:before{content:""}.fa-spotify:before{content:""}.fa-deviantart:before{content:""}.fa-soundcloud:before{content:""}.fa-database:before{content:""}.fa-file-pdf-o:before{content:""}.fa-file-word-o:before{content:""}.fa-file-excel-o:before{content:""}.fa-file-powerpoint-o:before{content:""}.fa-file-image-o:before,.fa-file-photo-o:before,.fa-file-picture-o:before{content:""}.fa-file-archive-o:before,.fa-file-zip-o:before{content:""}.fa-file-audio-o:before,.fa-file-sound-o:before{content:""}.fa-file-movie-o:before,.fa-file-video-o:before{content:""}.fa-file-code-o:before{content:""}.fa-vine:before{content:""}.fa-codepen:before{content:""}.fa-jsfiddle:before{content:""}.fa-life-bouy:before,.fa-life-buoy:before,.fa-life-ring:before,.fa-life-saver:before,.fa-support:before{content:""}.fa-circle-o-notch:before{content:""}.fa-ra:before,.fa-rebel:before,.fa-resistance:before{content:""}.fa-empire:before,.fa-ge:before{content:""}.fa-git-square:before{content:""}.fa-git:before{content:""}.fa-hacker-news:before,.fa-y-combinator-square:before,.fa-yc-square:before{content:""}.fa-tencent-weibo:before{content:""}.fa-qq:before{content:""}.fa-wechat:before,.fa-weixin:before{content:""}.fa-paper-plane:before,.fa-send:before{content:""}.fa-paper-plane-o:before,.fa-send-o:before{content:""}.fa-history:before{content:""}.fa-circle-thin:before{content:""}.fa-header:before{content:""}.fa-paragraph:before{content:""}.fa-sliders:before{content:""}.fa-share-alt:before{content:""}.fa-share-alt-square:before{content:""}.fa-bomb:before{content:""}.fa-futbol-o:before,.fa-soccer-ball-o:before{content:""}.fa-tty:before{content:""}.fa-binoculars:before{content:""}.fa-plug:before{content:""}.fa-slideshare:before{content:""}.fa-twitch:before{conten
t:""}.fa-yelp:before{content:""}.fa-newspaper-o:before{content:""}.fa-wifi:before{content:""}.fa-calculator:before{content:""}.fa-paypal:before{content:""}.fa-google-wallet:before{content:""}.fa-cc-visa:before{content:""}.fa-cc-mastercard:before{content:""}.fa-cc-discover:before{content:""}.fa-cc-amex:before{content:""}.fa-cc-paypal:before{content:""}.fa-cc-stripe:before{content:""}.fa-bell-slash:before{content:""}.fa-bell-slash-o:before{content:""}.fa-trash:before{content:""}.fa-copyright:before{content:""}.fa-at:before{content:""}.fa-eyedropper:before{content:""}.fa-paint-brush:before{content:""}.fa-birthday-cake:before{content:""}.fa-area-chart:before{content:""}.fa-pie-chart:before{content:""}.fa-line-chart:before{content:""}.fa-lastfm:before{content:""}.fa-lastfm-square:before{content:""}.fa-toggle-off:before{content:""}.fa-toggle-on:before{content:""}.fa-bicycle:before{content:""}.fa-bus:before{content:""}.fa-ioxhost:before{content:""}.fa-angellist:before{content:""}.fa-cc:before{content:""}.fa-ils:before,.fa-shekel:before,.fa-sheqel:before{content:""}.fa-meanpath:before{content:""}.fa-buysellads:before{content:""}.fa-connectdevelop:before{content:""}.fa-dashcube:before{content:""}.fa-forumbee:before{content:""}.fa-leanpub:before{content:""}.fa-sellsy:before{content:""}.fa-shirtsinbulk:before{content:""}.fa-simplybuilt:before{content:""}.fa-skyatlas:before{content:""}.fa-cart-plus:before{content:""}.fa-cart-arrow-down:before{content:""}.fa-diamond:before{content:""}.fa-ship:before{content:""}.fa-user-secret:before{content:""}.fa-motorcycle:before{content:""}.fa-street-view:before{content:""}.fa-heartbeat:before{content:""}.fa-venus:before{content:""}.fa-mars:before{content:""}.fa-mercury:before{content:""}.fa-intersex:before,.fa-transgender:before{content:""}.fa-transgender-alt:before{content:""}.fa-venus-double:before{content:""}.fa-mars-double:before{content:""}.fa-venus-mars:before{content:""}.fa-m
ars-stroke:before{content:""}.fa-mars-stroke-v:before{content:""}.fa-mars-stroke-h:before{content:""}.fa-neuter:before{content:""}.fa-genderless:before{content:""}.fa-facebook-official:before{content:""}.fa-pinterest-p:before{content:""}.fa-whatsapp:before{content:""}.fa-server:before{content:""}.fa-user-plus:before{content:""}.fa-user-times:before{content:""}.fa-bed:before,.fa-hotel:before{content:""}.fa-viacoin:before{content:""}.fa-train:before{content:""}.fa-subway:before{content:""}.fa-medium:before{content:""}.fa-y-combinator:before,.fa-yc:before{content:""}.fa-optin-monster:before{content:""}.fa-opencart:before{content:""}.fa-expeditedssl:before{content:""}.fa-battery-4:before,.fa-battery-full:before,.fa-battery:before{content:""}.fa-battery-3:before,.fa-battery-three-quarters:before{content:""}.fa-battery-2:before,.fa-battery-half:before{content:""}.fa-battery-1:before,.fa-battery-quarter:before{content:""}.fa-battery-0:before,.fa-battery-empty:before{content:""}.fa-mouse-pointer:before{content:""}.fa-i-cursor:before{content:""}.fa-object-group:before{content:""}.fa-object-ungroup:before{content:""}.fa-sticky-note:before{content:""}.fa-sticky-note-o:before{content:""}.fa-cc-jcb:before{content:""}.fa-cc-diners-club:before{content:""}.fa-clone:before{content:""}.fa-balance-scale:before{content:""}.fa-hourglass-o:before{content:""}.fa-hourglass-1:before,.fa-hourglass-start:before{content:""}.fa-hourglass-2:before,.fa-hourglass-half:before{content:""}.fa-hourglass-3:before,.fa-hourglass-end:before{content:""}.fa-hourglass:before{content:""}.fa-hand-grab-o:before,.fa-hand-rock-o:before{content:""}.fa-hand-paper-o:before,.fa-hand-stop-o:before{content:""}.fa-hand-scissors-o:before{content:""}.fa-hand-lizard-o:before{content:""}.fa-hand-spock-o:before{content:""}.fa-hand-pointer-o:before{content:""}.fa-hand-peace-o:before{content:""}.fa-trademark:before{content:""}.fa-registered:before{content:""}.fa-creative-commons
:before{content:""}.fa-gg:before{content:""}.fa-gg-circle:before{content:""}.fa-tripadvisor:before{content:""}.fa-odnoklassniki:before{content:""}.fa-odnoklassniki-square:before{content:""}.fa-get-pocket:before{content:""}.fa-wikipedia-w:before{content:""}.fa-safari:before{content:""}.fa-chrome:before{content:""}.fa-firefox:before{content:""}.fa-opera:before{content:""}.fa-internet-explorer:before{content:""}.fa-television:before,.fa-tv:before{content:""}.fa-contao:before{content:""}.fa-500px:before{content:""}.fa-amazon:before{content:""}.fa-calendar-plus-o:before{content:""}.fa-calendar-minus-o:before{content:""}.fa-calendar-times-o:before{content:""}.fa-calendar-check-o:before{content:""}.fa-industry:before{content:""}.fa-map-pin:before{content:""}.fa-map-signs:before{content:""}.fa-map-o:before{content:""}.fa-map:before{content:""}.fa-commenting:before{content:""}.fa-commenting-o:before{content:""}.fa-houzz:before{content:""}.fa-vimeo:before{content:""}.fa-black-tie:before{content:""}.fa-fonticons:before{content:""}.fa-reddit-alien:before{content:""}.fa-edge:before{content:""}.fa-credit-card-alt:before{content:""}.fa-codiepie:before{content:""}.fa-modx:before{content:""}.fa-fort-awesome:before{content:""}.fa-usb:before{content:""}.fa-product-hunt:before{content:""}.fa-mixcloud:before{content:""}.fa-scribd:before{content:""}.fa-pause-circle:before{content:""}.fa-pause-circle-o:before{content:""}.fa-stop-circle:before{content:""}.fa-stop-circle-o:before{content:""}.fa-shopping-bag:before{content:""}.fa-shopping-basket:before{content:""}.fa-hashtag:before{content:""}.fa-bluetooth:before{content:""}.fa-bluetooth-b:before{content:""}.fa-percent:before{content:""}.fa-gitlab:before,.icon-gitlab:before{content:""}.fa-wpbeginner:before{content:""}.fa-wpforms:before{content:""}.fa-envira:before{content:""}.fa-universal-access:before{content:""}.fa-wheelchair-alt:before{content:""}.fa-question-circle-o:before{conten
t:""}.fa-blind:before{content:""}.fa-audio-description:before{content:""}.fa-volume-control-phone:before{content:""}.fa-braille:before{content:""}.fa-assistive-listening-systems:before{content:""}.fa-american-sign-language-interpreting:before,.fa-asl-interpreting:before{content:""}.fa-deaf:before,.fa-deafness:before,.fa-hard-of-hearing:before{content:""}.fa-glide:before{content:""}.fa-glide-g:before{content:""}.fa-sign-language:before,.fa-signing:before{content:""}.fa-low-vision:before{content:""}.fa-viadeo:before{content:""}.fa-viadeo-square:before{content:""}.fa-snapchat:before{content:""}.fa-snapchat-ghost:before{content:""}.fa-snapchat-square:before{content:""}.fa-pied-piper:before{content:""}.fa-first-order:before{content:""}.fa-yoast:before{content:""}.fa-themeisle:before{content:""}.fa-google-plus-circle:before,.fa-google-plus-official:before{content:""}.fa-fa:before,.fa-font-awesome:before{content:""}.fa-handshake-o:before{content:""}.fa-envelope-open:before{content:""}.fa-envelope-open-o:before{content:""}.fa-linode:before{content:""}.fa-address-book:before{content:""}.fa-address-book-o:before{content:""}.fa-address-card:before,.fa-vcard:before{content:""}.fa-address-card-o:before,.fa-vcard-o:before{content:""}.fa-user-circle:before{content:""}.fa-user-circle-o:before{content:""}.fa-user-o:before{content:""}.fa-id-badge:before{content:""}.fa-drivers-license:before,.fa-id-card:before{content:""}.fa-drivers-license-o:before,.fa-id-card-o:before{content:""}.fa-quora:before{content:""}.fa-free-code-camp:before{content:""}.fa-telegram:before{content:""}.fa-thermometer-4:before,.fa-thermometer-full:before,.fa-thermometer:before{content:""}.fa-thermometer-3:before,.fa-thermometer-three-quarters:before{content:""}.fa-thermometer-2:before,.fa-thermometer-half:before{content:""}.fa-thermometer-1:before,.fa-thermometer-quarter:before{content:""}.fa-thermometer-0:before,.fa-thermometer-empty:before{content:""}.fa-shower:befo
re{content:""}.fa-bath:before,.fa-bathtub:before,.fa-s15:before{content:""}.fa-podcast:before{content:""}.fa-window-maximize:before{content:""}.fa-window-minimize:before{content:""}.fa-window-restore:before{content:""}.fa-times-rectangle:before,.fa-window-close:before{content:""}.fa-times-rectangle-o:before,.fa-window-close-o:before{content:""}.fa-bandcamp:before{content:""}.fa-grav:before{content:""}.fa-etsy:before{content:""}.fa-imdb:before{content:""}.fa-ravelry:before{content:""}.fa-eercast:before{content:""}.fa-microchip:before{content:""}.fa-snowflake-o:before{content:""}.fa-superpowers:before{content:""}.fa-wpexplorer:before{content:""}.fa-meetup:before{content:""}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);border:0}.sr-only-focusable:active,.sr-only-focusable:focus{position:static;width:auto;height:auto;margin:0;overflow:visible;clip:auto}.fa,.icon,.rst-content .admonition-title,.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content code.download span:first-child,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 .headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink,.rst-content tt.download span:first-child,.wy-dropdown .caret,.wy-inline-validate.wy-inline-validate-danger .wy-input-context,.wy-inline-validate.wy-inline-validate-info .wy-input-context,.wy-inline-validate.wy-inline-validate-success .wy-input-context,.wy-inline-validate.wy-inline-validate-warning .wy-input-context,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li button.toctree-expand{font-family:inherit}.fa:before,.icon:before,.rst-content .admonition-title:before,.rst-content .code-block-caption 
.headerlink:before,.rst-content .eqno .headerlink:before,.rst-content code.download span:first-child:before,.rst-content dl dt .headerlink:before,.rst-content h1 .headerlink:before,.rst-content h2 .headerlink:before,.rst-content h3 .headerlink:before,.rst-content h4 .headerlink:before,.rst-content h5 .headerlink:before,.rst-content h6 .headerlink:before,.rst-content p.caption .headerlink:before,.rst-content p .headerlink:before,.rst-content table>caption .headerlink:before,.rst-content tt.download span:first-child:before,.wy-dropdown .caret:before,.wy-inline-validate.wy-inline-validate-danger .wy-input-context:before,.wy-inline-validate.wy-inline-validate-info .wy-input-context:before,.wy-inline-validate.wy-inline-validate-success .wy-input-context:before,.wy-inline-validate.wy-inline-validate-warning .wy-input-context:before,.wy-menu-vertical li.current>a button.toctree-expand:before,.wy-menu-vertical li.on a button.toctree-expand:before,.wy-menu-vertical li button.toctree-expand:before{font-family:FontAwesome;display:inline-block;font-style:normal;font-weight:400;line-height:1;text-decoration:inherit}.rst-content .code-block-caption a .headerlink,.rst-content .eqno a .headerlink,.rst-content a .admonition-title,.rst-content code.download a span:first-child,.rst-content dl dt a .headerlink,.rst-content h1 a .headerlink,.rst-content h2 a .headerlink,.rst-content h3 a .headerlink,.rst-content h4 a .headerlink,.rst-content h5 a .headerlink,.rst-content h6 a .headerlink,.rst-content p.caption a .headerlink,.rst-content p a .headerlink,.rst-content table>caption a .headerlink,.rst-content tt.download a span:first-child,.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a button.toctree-expand,.wy-menu-vertical li a button.toctree-expand,a .fa,a .icon,a .rst-content .admonition-title,a .rst-content .code-block-caption .headerlink,a .rst-content .eqno .headerlink,a .rst-content code.download span:first-child,a .rst-content dl dt .headerlink,a 
.rst-content h1 .headerlink,a .rst-content h2 .headerlink,a .rst-content h3 .headerlink,a .rst-content h4 .headerlink,a .rst-content h5 .headerlink,a .rst-content h6 .headerlink,a .rst-content p.caption .headerlink,a .rst-content p .headerlink,a .rst-content table>caption .headerlink,a .rst-content tt.download span:first-child,a .wy-menu-vertical li button.toctree-expand{display:inline-block;text-decoration:inherit}.btn .fa,.btn .icon,.btn .rst-content .admonition-title,.btn .rst-content .code-block-caption .headerlink,.btn .rst-content .eqno .headerlink,.btn .rst-content code.download span:first-child,.btn .rst-content dl dt .headerlink,.btn .rst-content h1 .headerlink,.btn .rst-content h2 .headerlink,.btn .rst-content h3 .headerlink,.btn .rst-content h4 .headerlink,.btn .rst-content h5 .headerlink,.btn .rst-content h6 .headerlink,.btn .rst-content p .headerlink,.btn .rst-content table>caption .headerlink,.btn .rst-content tt.download span:first-child,.btn .wy-menu-vertical li.current>a button.toctree-expand,.btn .wy-menu-vertical li.on a button.toctree-expand,.btn .wy-menu-vertical li button.toctree-expand,.nav .fa,.nav .icon,.nav .rst-content .admonition-title,.nav .rst-content .code-block-caption .headerlink,.nav .rst-content .eqno .headerlink,.nav .rst-content code.download span:first-child,.nav .rst-content dl dt .headerlink,.nav .rst-content h1 .headerlink,.nav .rst-content h2 .headerlink,.nav .rst-content h3 .headerlink,.nav .rst-content h4 .headerlink,.nav .rst-content h5 .headerlink,.nav .rst-content h6 .headerlink,.nav .rst-content p .headerlink,.nav .rst-content table>caption .headerlink,.nav .rst-content tt.download span:first-child,.nav .wy-menu-vertical li.current>a button.toctree-expand,.nav .wy-menu-vertical li.on a button.toctree-expand,.nav .wy-menu-vertical li button.toctree-expand,.rst-content .btn .admonition-title,.rst-content .code-block-caption .btn .headerlink,.rst-content .code-block-caption .nav .headerlink,.rst-content .eqno .btn 
.headerlink,.rst-content .eqno .nav .headerlink,.rst-content .nav .admonition-title,.rst-content code.download .btn span:first-child,.rst-content code.download .nav span:first-child,.rst-content dl dt .btn .headerlink,.rst-content dl dt .nav .headerlink,.rst-content h1 .btn .headerlink,.rst-content h1 .nav .headerlink,.rst-content h2 .btn .headerlink,.rst-content h2 .nav .headerlink,.rst-content h3 .btn .headerlink,.rst-content h3 .nav .headerlink,.rst-content h4 .btn .headerlink,.rst-content h4 .nav .headerlink,.rst-content h5 .btn .headerlink,.rst-content h5 .nav .headerlink,.rst-content h6 .btn .headerlink,.rst-content h6 .nav .headerlink,.rst-content p .btn .headerlink,.rst-content p .nav .headerlink,.rst-content table>caption .btn .headerlink,.rst-content table>caption .nav .headerlink,.rst-content tt.download .btn span:first-child,.rst-content tt.download .nav span:first-child,.wy-menu-vertical li .btn button.toctree-expand,.wy-menu-vertical li.current>a .btn button.toctree-expand,.wy-menu-vertical li.current>a .nav button.toctree-expand,.wy-menu-vertical li .nav button.toctree-expand,.wy-menu-vertical li.on a .btn button.toctree-expand,.wy-menu-vertical li.on a .nav button.toctree-expand{display:inline}.btn .fa-large.icon,.btn .fa.fa-large,.btn .rst-content .code-block-caption .fa-large.headerlink,.btn .rst-content .eqno .fa-large.headerlink,.btn .rst-content .fa-large.admonition-title,.btn .rst-content code.download span.fa-large:first-child,.btn .rst-content dl dt .fa-large.headerlink,.btn .rst-content h1 .fa-large.headerlink,.btn .rst-content h2 .fa-large.headerlink,.btn .rst-content h3 .fa-large.headerlink,.btn .rst-content h4 .fa-large.headerlink,.btn .rst-content h5 .fa-large.headerlink,.btn .rst-content h6 .fa-large.headerlink,.btn .rst-content p .fa-large.headerlink,.btn .rst-content table>caption .fa-large.headerlink,.btn .rst-content tt.download span.fa-large:first-child,.btn .wy-menu-vertical li button.fa-large.toctree-expand,.nav 
.fa-large.icon,.nav .fa.fa-large,.nav .rst-content .code-block-caption .fa-large.headerlink,.nav .rst-content .eqno .fa-large.headerlink,.nav .rst-content .fa-large.admonition-title,.nav .rst-content code.download span.fa-large:first-child,.nav .rst-content dl dt .fa-large.headerlink,.nav .rst-content h1 .fa-large.headerlink,.nav .rst-content h2 .fa-large.headerlink,.nav .rst-content h3 .fa-large.headerlink,.nav .rst-content h4 .fa-large.headerlink,.nav .rst-content h5 .fa-large.headerlink,.nav .rst-content h6 .fa-large.headerlink,.nav .rst-content p .fa-large.headerlink,.nav .rst-content table>caption .fa-large.headerlink,.nav .rst-content tt.download span.fa-large:first-child,.nav .wy-menu-vertical li button.fa-large.toctree-expand,.rst-content .btn .fa-large.admonition-title,.rst-content .code-block-caption .btn .fa-large.headerlink,.rst-content .code-block-caption .nav .fa-large.headerlink,.rst-content .eqno .btn .fa-large.headerlink,.rst-content .eqno .nav .fa-large.headerlink,.rst-content .nav .fa-large.admonition-title,.rst-content code.download .btn span.fa-large:first-child,.rst-content code.download .nav span.fa-large:first-child,.rst-content dl dt .btn .fa-large.headerlink,.rst-content dl dt .nav .fa-large.headerlink,.rst-content h1 .btn .fa-large.headerlink,.rst-content h1 .nav .fa-large.headerlink,.rst-content h2 .btn .fa-large.headerlink,.rst-content h2 .nav .fa-large.headerlink,.rst-content h3 .btn .fa-large.headerlink,.rst-content h3 .nav .fa-large.headerlink,.rst-content h4 .btn .fa-large.headerlink,.rst-content h4 .nav .fa-large.headerlink,.rst-content h5 .btn .fa-large.headerlink,.rst-content h5 .nav .fa-large.headerlink,.rst-content h6 .btn .fa-large.headerlink,.rst-content h6 .nav .fa-large.headerlink,.rst-content p .btn .fa-large.headerlink,.rst-content p .nav .fa-large.headerlink,.rst-content table>caption .btn .fa-large.headerlink,.rst-content table>caption .nav .fa-large.headerlink,.rst-content tt.download .btn 
span.fa-large:first-child,.rst-content tt.download .nav span.fa-large:first-child,.wy-menu-vertical li .btn button.fa-large.toctree-expand,.wy-menu-vertical li .nav button.fa-large.toctree-expand{line-height:.9em}.btn .fa-spin.icon,.btn .fa.fa-spin,.btn .rst-content .code-block-caption .fa-spin.headerlink,.btn .rst-content .eqno .fa-spin.headerlink,.btn .rst-content .fa-spin.admonition-title,.btn .rst-content code.download span.fa-spin:first-child,.btn .rst-content dl dt .fa-spin.headerlink,.btn .rst-content h1 .fa-spin.headerlink,.btn .rst-content h2 .fa-spin.headerlink,.btn .rst-content h3 .fa-spin.headerlink,.btn .rst-content h4 .fa-spin.headerlink,.btn .rst-content h5 .fa-spin.headerlink,.btn .rst-content h6 .fa-spin.headerlink,.btn .rst-content p .fa-spin.headerlink,.btn .rst-content table>caption .fa-spin.headerlink,.btn .rst-content tt.download span.fa-spin:first-child,.btn .wy-menu-vertical li button.fa-spin.toctree-expand,.nav .fa-spin.icon,.nav .fa.fa-spin,.nav .rst-content .code-block-caption .fa-spin.headerlink,.nav .rst-content .eqno .fa-spin.headerlink,.nav .rst-content .fa-spin.admonition-title,.nav .rst-content code.download span.fa-spin:first-child,.nav .rst-content dl dt .fa-spin.headerlink,.nav .rst-content h1 .fa-spin.headerlink,.nav .rst-content h2 .fa-spin.headerlink,.nav .rst-content h3 .fa-spin.headerlink,.nav .rst-content h4 .fa-spin.headerlink,.nav .rst-content h5 .fa-spin.headerlink,.nav .rst-content h6 .fa-spin.headerlink,.nav .rst-content p .fa-spin.headerlink,.nav .rst-content table>caption .fa-spin.headerlink,.nav .rst-content tt.download span.fa-spin:first-child,.nav .wy-menu-vertical li button.fa-spin.toctree-expand,.rst-content .btn .fa-spin.admonition-title,.rst-content .code-block-caption .btn .fa-spin.headerlink,.rst-content .code-block-caption .nav .fa-spin.headerlink,.rst-content .eqno .btn .fa-spin.headerlink,.rst-content .eqno .nav .fa-spin.headerlink,.rst-content .nav .fa-spin.admonition-title,.rst-content code.download 
.btn span.fa-spin:first-child,.rst-content code.download .nav span.fa-spin:first-child,.rst-content dl dt .btn .fa-spin.headerlink,.rst-content dl dt .nav .fa-spin.headerlink,.rst-content h1 .btn .fa-spin.headerlink,.rst-content h1 .nav .fa-spin.headerlink,.rst-content h2 .btn .fa-spin.headerlink,.rst-content h2 .nav .fa-spin.headerlink,.rst-content h3 .btn .fa-spin.headerlink,.rst-content h3 .nav .fa-spin.headerlink,.rst-content h4 .btn .fa-spin.headerlink,.rst-content h4 .nav .fa-spin.headerlink,.rst-content h5 .btn .fa-spin.headerlink,.rst-content h5 .nav .fa-spin.headerlink,.rst-content h6 .btn .fa-spin.headerlink,.rst-content h6 .nav .fa-spin.headerlink,.rst-content p .btn .fa-spin.headerlink,.rst-content p .nav .fa-spin.headerlink,.rst-content table>caption .btn .fa-spin.headerlink,.rst-content table>caption .nav .fa-spin.headerlink,.rst-content tt.download .btn span.fa-spin:first-child,.rst-content tt.download .nav span.fa-spin:first-child,.wy-menu-vertical li .btn button.fa-spin.toctree-expand,.wy-menu-vertical li .nav button.fa-spin.toctree-expand{display:inline-block}.btn.fa:before,.btn.icon:before,.rst-content .btn.admonition-title:before,.rst-content .code-block-caption .btn.headerlink:before,.rst-content .eqno .btn.headerlink:before,.rst-content code.download span.btn:first-child:before,.rst-content dl dt .btn.headerlink:before,.rst-content h1 .btn.headerlink:before,.rst-content h2 .btn.headerlink:before,.rst-content h3 .btn.headerlink:before,.rst-content h4 .btn.headerlink:before,.rst-content h5 .btn.headerlink:before,.rst-content h6 .btn.headerlink:before,.rst-content p .btn.headerlink:before,.rst-content table>caption .btn.headerlink:before,.rst-content tt.download span.btn:first-child:before,.wy-menu-vertical li button.btn.toctree-expand:before{opacity:.5;-webkit-transition:opacity .05s ease-in;-moz-transition:opacity .05s ease-in;transition:opacity .05s ease-in}.btn.fa:hover:before,.btn.icon:hover:before,.rst-content 
.btn.admonition-title:hover:before,.rst-content .code-block-caption .btn.headerlink:hover:before,.rst-content .eqno .btn.headerlink:hover:before,.rst-content code.download span.btn:first-child:hover:before,.rst-content dl dt .btn.headerlink:hover:before,.rst-content h1 .btn.headerlink:hover:before,.rst-content h2 .btn.headerlink:hover:before,.rst-content h3 .btn.headerlink:hover:before,.rst-content h4 .btn.headerlink:hover:before,.rst-content h5 .btn.headerlink:hover:before,.rst-content h6 .btn.headerlink:hover:before,.rst-content p .btn.headerlink:hover:before,.rst-content table>caption .btn.headerlink:hover:before,.rst-content tt.download span.btn:first-child:hover:before,.wy-menu-vertical li button.btn.toctree-expand:hover:before{opacity:1}.btn-mini .fa:before,.btn-mini .icon:before,.btn-mini .rst-content .admonition-title:before,.btn-mini .rst-content .code-block-caption .headerlink:before,.btn-mini .rst-content .eqno .headerlink:before,.btn-mini .rst-content code.download span:first-child:before,.btn-mini .rst-content dl dt .headerlink:before,.btn-mini .rst-content h1 .headerlink:before,.btn-mini .rst-content h2 .headerlink:before,.btn-mini .rst-content h3 .headerlink:before,.btn-mini .rst-content h4 .headerlink:before,.btn-mini .rst-content h5 .headerlink:before,.btn-mini .rst-content h6 .headerlink:before,.btn-mini .rst-content p .headerlink:before,.btn-mini .rst-content table>caption .headerlink:before,.btn-mini .rst-content tt.download span:first-child:before,.btn-mini .wy-menu-vertical li button.toctree-expand:before,.rst-content .btn-mini .admonition-title:before,.rst-content .code-block-caption .btn-mini .headerlink:before,.rst-content .eqno .btn-mini .headerlink:before,.rst-content code.download .btn-mini span:first-child:before,.rst-content dl dt .btn-mini .headerlink:before,.rst-content h1 .btn-mini .headerlink:before,.rst-content h2 .btn-mini .headerlink:before,.rst-content h3 .btn-mini .headerlink:before,.rst-content h4 .btn-mini 
.headerlink:before,.rst-content h5 .btn-mini .headerlink:before,.rst-content h6 .btn-mini .headerlink:before,.rst-content p .btn-mini .headerlink:before,.rst-content table>caption .btn-mini .headerlink:before,.rst-content tt.download .btn-mini span:first-child:before,.wy-menu-vertical li .btn-mini button.toctree-expand:before{font-size:14px;vertical-align:-15%}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning,.wy-alert{padding:12px;line-height:24px;margin-bottom:24px;background:#e7f2fa}.rst-content .admonition-title,.wy-alert-title{font-weight:700;display:block;color:#fff;background:#6ab0de;padding:6px 12px;margin:-12px -12px 12px}.rst-content .danger,.rst-content .error,.rst-content .wy-alert-danger.admonition,.rst-content .wy-alert-danger.admonition-todo,.rst-content .wy-alert-danger.attention,.rst-content .wy-alert-danger.caution,.rst-content .wy-alert-danger.hint,.rst-content .wy-alert-danger.important,.rst-content .wy-alert-danger.note,.rst-content .wy-alert-danger.seealso,.rst-content .wy-alert-danger.tip,.rst-content .wy-alert-danger.warning,.wy-alert.wy-alert-danger{background:#fdf3f2}.rst-content .danger .admonition-title,.rst-content .danger .wy-alert-title,.rst-content .error .admonition-title,.rst-content .error .wy-alert-title,.rst-content .wy-alert-danger.admonition-todo .admonition-title,.rst-content .wy-alert-danger.admonition-todo .wy-alert-title,.rst-content .wy-alert-danger.admonition .admonition-title,.rst-content .wy-alert-danger.admonition .wy-alert-title,.rst-content .wy-alert-danger.attention .admonition-title,.rst-content .wy-alert-danger.attention .wy-alert-title,.rst-content .wy-alert-danger.caution .admonition-title,.rst-content .wy-alert-danger.caution .wy-alert-title,.rst-content .wy-alert-danger.hint 
.admonition-title,.rst-content .wy-alert-danger.hint .wy-alert-title,.rst-content .wy-alert-danger.important .admonition-title,.rst-content .wy-alert-danger.important .wy-alert-title,.rst-content .wy-alert-danger.note .admonition-title,.rst-content .wy-alert-danger.note .wy-alert-title,.rst-content .wy-alert-danger.seealso .admonition-title,.rst-content .wy-alert-danger.seealso .wy-alert-title,.rst-content .wy-alert-danger.tip .admonition-title,.rst-content .wy-alert-danger.tip .wy-alert-title,.rst-content .wy-alert-danger.warning .admonition-title,.rst-content .wy-alert-danger.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-danger .admonition-title,.wy-alert.wy-alert-danger .rst-content .admonition-title,.wy-alert.wy-alert-danger .wy-alert-title{background:#f29f97}.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .warning,.rst-content .wy-alert-warning.admonition,.rst-content .wy-alert-warning.danger,.rst-content .wy-alert-warning.error,.rst-content .wy-alert-warning.hint,.rst-content .wy-alert-warning.important,.rst-content .wy-alert-warning.note,.rst-content .wy-alert-warning.seealso,.rst-content .wy-alert-warning.tip,.wy-alert.wy-alert-warning{background:#ffedcc}.rst-content .admonition-todo .admonition-title,.rst-content .admonition-todo .wy-alert-title,.rst-content .attention .admonition-title,.rst-content .attention .wy-alert-title,.rst-content .caution .admonition-title,.rst-content .caution .wy-alert-title,.rst-content .warning .admonition-title,.rst-content .warning .wy-alert-title,.rst-content .wy-alert-warning.admonition .admonition-title,.rst-content .wy-alert-warning.admonition .wy-alert-title,.rst-content .wy-alert-warning.danger .admonition-title,.rst-content .wy-alert-warning.danger .wy-alert-title,.rst-content .wy-alert-warning.error .admonition-title,.rst-content .wy-alert-warning.error .wy-alert-title,.rst-content .wy-alert-warning.hint .admonition-title,.rst-content .wy-alert-warning.hint 
.wy-alert-title,.rst-content .wy-alert-warning.important .admonition-title,.rst-content .wy-alert-warning.important .wy-alert-title,.rst-content .wy-alert-warning.note .admonition-title,.rst-content .wy-alert-warning.note .wy-alert-title,.rst-content .wy-alert-warning.seealso .admonition-title,.rst-content .wy-alert-warning.seealso .wy-alert-title,.rst-content .wy-alert-warning.tip .admonition-title,.rst-content .wy-alert-warning.tip .wy-alert-title,.rst-content .wy-alert.wy-alert-warning .admonition-title,.wy-alert.wy-alert-warning .rst-content .admonition-title,.wy-alert.wy-alert-warning .wy-alert-title{background:#f0b37e}.rst-content .note,.rst-content .seealso,.rst-content .wy-alert-info.admonition,.rst-content .wy-alert-info.admonition-todo,.rst-content .wy-alert-info.attention,.rst-content .wy-alert-info.caution,.rst-content .wy-alert-info.danger,.rst-content .wy-alert-info.error,.rst-content .wy-alert-info.hint,.rst-content .wy-alert-info.important,.rst-content .wy-alert-info.tip,.rst-content .wy-alert-info.warning,.wy-alert.wy-alert-info{background:#e7f2fa}.rst-content .note .admonition-title,.rst-content .note .wy-alert-title,.rst-content .seealso .admonition-title,.rst-content .seealso .wy-alert-title,.rst-content .wy-alert-info.admonition-todo .admonition-title,.rst-content .wy-alert-info.admonition-todo .wy-alert-title,.rst-content .wy-alert-info.admonition .admonition-title,.rst-content .wy-alert-info.admonition .wy-alert-title,.rst-content .wy-alert-info.attention .admonition-title,.rst-content .wy-alert-info.attention .wy-alert-title,.rst-content .wy-alert-info.caution .admonition-title,.rst-content .wy-alert-info.caution .wy-alert-title,.rst-content .wy-alert-info.danger .admonition-title,.rst-content .wy-alert-info.danger .wy-alert-title,.rst-content .wy-alert-info.error .admonition-title,.rst-content .wy-alert-info.error .wy-alert-title,.rst-content .wy-alert-info.hint .admonition-title,.rst-content .wy-alert-info.hint .wy-alert-title,.rst-content 
.wy-alert-info.important .admonition-title,.rst-content .wy-alert-info.important .wy-alert-title,.rst-content .wy-alert-info.tip .admonition-title,.rst-content .wy-alert-info.tip .wy-alert-title,.rst-content .wy-alert-info.warning .admonition-title,.rst-content .wy-alert-info.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-info .admonition-title,.wy-alert.wy-alert-info .rst-content .admonition-title,.wy-alert.wy-alert-info .wy-alert-title{background:#6ab0de}.rst-content .hint,.rst-content .important,.rst-content .tip,.rst-content .wy-alert-success.admonition,.rst-content .wy-alert-success.admonition-todo,.rst-content .wy-alert-success.attention,.rst-content .wy-alert-success.caution,.rst-content .wy-alert-success.danger,.rst-content .wy-alert-success.error,.rst-content .wy-alert-success.note,.rst-content .wy-alert-success.seealso,.rst-content .wy-alert-success.warning,.wy-alert.wy-alert-success{background:#dbfaf4}.rst-content .hint .admonition-title,.rst-content .hint .wy-alert-title,.rst-content .important .admonition-title,.rst-content .important .wy-alert-title,.rst-content .tip .admonition-title,.rst-content .tip .wy-alert-title,.rst-content .wy-alert-success.admonition-todo .admonition-title,.rst-content .wy-alert-success.admonition-todo .wy-alert-title,.rst-content .wy-alert-success.admonition .admonition-title,.rst-content .wy-alert-success.admonition .wy-alert-title,.rst-content .wy-alert-success.attention .admonition-title,.rst-content .wy-alert-success.attention .wy-alert-title,.rst-content .wy-alert-success.caution .admonition-title,.rst-content .wy-alert-success.caution .wy-alert-title,.rst-content .wy-alert-success.danger .admonition-title,.rst-content .wy-alert-success.danger .wy-alert-title,.rst-content .wy-alert-success.error .admonition-title,.rst-content .wy-alert-success.error .wy-alert-title,.rst-content .wy-alert-success.note .admonition-title,.rst-content .wy-alert-success.note .wy-alert-title,.rst-content .wy-alert-success.seealso 
.admonition-title,.rst-content .wy-alert-success.seealso .wy-alert-title,.rst-content .wy-alert-success.warning .admonition-title,.rst-content .wy-alert-success.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-success .admonition-title,.wy-alert.wy-alert-success .rst-content .admonition-title,.wy-alert.wy-alert-success .wy-alert-title{background:#1abc9c}.rst-content .wy-alert-neutral.admonition,.rst-content .wy-alert-neutral.admonition-todo,.rst-content .wy-alert-neutral.attention,.rst-content .wy-alert-neutral.caution,.rst-content .wy-alert-neutral.danger,.rst-content .wy-alert-neutral.error,.rst-content .wy-alert-neutral.hint,.rst-content .wy-alert-neutral.important,.rst-content .wy-alert-neutral.note,.rst-content .wy-alert-neutral.seealso,.rst-content .wy-alert-neutral.tip,.rst-content .wy-alert-neutral.warning,.wy-alert.wy-alert-neutral{background:#f3f6f6}.rst-content .wy-alert-neutral.admonition-todo .admonition-title,.rst-content .wy-alert-neutral.admonition-todo .wy-alert-title,.rst-content .wy-alert-neutral.admonition .admonition-title,.rst-content .wy-alert-neutral.admonition .wy-alert-title,.rst-content .wy-alert-neutral.attention .admonition-title,.rst-content .wy-alert-neutral.attention .wy-alert-title,.rst-content .wy-alert-neutral.caution .admonition-title,.rst-content .wy-alert-neutral.caution .wy-alert-title,.rst-content .wy-alert-neutral.danger .admonition-title,.rst-content .wy-alert-neutral.danger .wy-alert-title,.rst-content .wy-alert-neutral.error .admonition-title,.rst-content .wy-alert-neutral.error .wy-alert-title,.rst-content .wy-alert-neutral.hint .admonition-title,.rst-content .wy-alert-neutral.hint .wy-alert-title,.rst-content .wy-alert-neutral.important .admonition-title,.rst-content .wy-alert-neutral.important .wy-alert-title,.rst-content .wy-alert-neutral.note .admonition-title,.rst-content .wy-alert-neutral.note .wy-alert-title,.rst-content .wy-alert-neutral.seealso .admonition-title,.rst-content .wy-alert-neutral.seealso 
.wy-alert-title,.rst-content .wy-alert-neutral.tip .admonition-title,.rst-content .wy-alert-neutral.tip .wy-alert-title,.rst-content .wy-alert-neutral.warning .admonition-title,.rst-content .wy-alert-neutral.warning .wy-alert-title,.rst-content .wy-alert.wy-alert-neutral .admonition-title,.wy-alert.wy-alert-neutral .rst-content .admonition-title,.wy-alert.wy-alert-neutral .wy-alert-title{color:#404040;background:#e1e4e5}.rst-content .wy-alert-neutral.admonition-todo a,.rst-content .wy-alert-neutral.admonition a,.rst-content .wy-alert-neutral.attention a,.rst-content .wy-alert-neutral.caution a,.rst-content .wy-alert-neutral.danger a,.rst-content .wy-alert-neutral.error a,.rst-content .wy-alert-neutral.hint a,.rst-content .wy-alert-neutral.important a,.rst-content .wy-alert-neutral.note a,.rst-content .wy-alert-neutral.seealso a,.rst-content .wy-alert-neutral.tip a,.rst-content .wy-alert-neutral.warning a,.wy-alert.wy-alert-neutral a{color:#2980b9}.rst-content .admonition-todo p:last-child,.rst-content .admonition p:last-child,.rst-content .attention p:last-child,.rst-content .caution p:last-child,.rst-content .danger p:last-child,.rst-content .error p:last-child,.rst-content .hint p:last-child,.rst-content .important p:last-child,.rst-content .note p:last-child,.rst-content .seealso p:last-child,.rst-content .tip p:last-child,.rst-content .warning p:last-child,.wy-alert p:last-child{margin-bottom:0}.wy-tray-container{position:fixed;bottom:0;left:0;z-index:600}.wy-tray-container li{display:block;width:300px;background:transparent;color:#fff;text-align:center;box-shadow:0 5px 5px 0 rgba(0,0,0,.1);padding:0 24px;min-width:20%;opacity:0;height:0;line-height:56px;overflow:hidden;-webkit-transition:all .3s ease-in;-moz-transition:all .3s ease-in;transition:all .3s ease-in}.wy-tray-container li.wy-tray-item-success{background:#27ae60}.wy-tray-container li.wy-tray-item-info{background:#2980b9}.wy-tray-container li.wy-tray-item-warning{background:#e67e22}.wy-tray-container 
li.wy-tray-item-danger{background:#e74c3c}.wy-tray-container li.on{opacity:1;height:56px}@media screen and (max-width:768px){.wy-tray-container{bottom:auto;top:0;width:100%}.wy-tray-container li{width:100%}}button{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle;cursor:pointer;line-height:normal;-webkit-appearance:button;*overflow:visible}button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0}button[disabled]{cursor:default}.btn{display:inline-block;border-radius:2px;line-height:normal;white-space:nowrap;text-align:center;cursor:pointer;font-size:100%;padding:6px 12px 8px;color:#fff;border:1px solid rgba(0,0,0,.1);background-color:#27ae60;text-decoration:none;font-weight:400;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 2px -1px hsla(0,0%,100%,.5),inset 0 -2px 0 0 rgba(0,0,0,.1);outline-none:false;vertical-align:middle;*display:inline;zoom:1;-webkit-user-drag:none;-webkit-user-select:none;-moz-user-select:none;-ms-user-select:none;user-select:none;-webkit-transition:all .1s linear;-moz-transition:all .1s linear;transition:all .1s linear}.btn-hover{background:#2e8ece;color:#fff}.btn:hover{background:#2cc36b;color:#fff}.btn:focus{background:#2cc36b;outline:0}.btn:active{box-shadow:inset 0 -1px 0 0 rgba(0,0,0,.05),inset 0 2px 0 0 rgba(0,0,0,.1);padding:8px 12px 6px}.btn:visited{color:#fff}.btn-disabled,.btn-disabled:active,.btn-disabled:focus,.btn-disabled:hover,.btn:disabled{background-image:none;filter:progid:DXImageTransform.Microsoft.gradient(enabled = 
false);filter:alpha(opacity=40);opacity:.4;cursor:not-allowed;box-shadow:none}.btn::-moz-focus-inner{padding:0;border:0}.btn-small{font-size:80%}.btn-info{background-color:#2980b9!important}.btn-info:hover{background-color:#2e8ece!important}.btn-neutral{background-color:#f3f6f6!important;color:#404040!important}.btn-neutral:hover{background-color:#e5ebeb!important;color:#404040}.btn-neutral:visited{color:#404040!important}.btn-success{background-color:#27ae60!important}.btn-success:hover{background-color:#295!important}.btn-danger{background-color:#e74c3c!important}.btn-danger:hover{background-color:#ea6153!important}.btn-warning{background-color:#e67e22!important}.btn-warning:hover{background-color:#e98b39!important}.btn-invert{background-color:#222}.btn-invert:hover{background-color:#2f2f2f!important}.btn-link{background-color:transparent!important;color:#2980b9;box-shadow:none;border-color:transparent!important}.btn-link:active,.btn-link:hover{background-color:transparent!important;color:#409ad5!important;box-shadow:none}.btn-link:visited{color:#9b59b6}.wy-btn-group .btn,.wy-control .btn{vertical-align:middle}.wy-btn-group{margin-bottom:24px;*zoom:1}.wy-btn-group:after,.wy-btn-group:before{display:table;content:""}.wy-btn-group:after{clear:both}.wy-dropdown{position:relative;display:inline-block}.wy-dropdown-active .wy-dropdown-menu{display:block}.wy-dropdown-menu{position:absolute;left:0;display:none;float:left;top:100%;min-width:100%;background:#fcfcfc;z-index:100;border:1px solid #cfd7dd;box-shadow:0 2px 2px 0 rgba(0,0,0,.1);padding:12px}.wy-dropdown-menu>dd>a{display:block;clear:both;color:#404040;white-space:nowrap;font-size:90%;padding:0 12px;cursor:pointer}.wy-dropdown-menu>dd>a:hover{background:#2980b9;color:#fff}.wy-dropdown-menu>dd.divider{border-top:1px solid #cfd7dd;margin:6px 0}.wy-dropdown-menu>dd.search{padding-bottom:12px}.wy-dropdown-menu>dd.search 
input[type=search]{width:100%}.wy-dropdown-menu>dd.call-to-action{background:#e3e3e3;text-transform:uppercase;font-weight:500;font-size:80%}.wy-dropdown-menu>dd.call-to-action:hover{background:#e3e3e3}.wy-dropdown-menu>dd.call-to-action .btn{color:#fff}.wy-dropdown.wy-dropdown-up .wy-dropdown-menu{bottom:100%;top:auto;left:auto;right:0}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu{background:#fcfcfc;margin-top:2px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a{padding:6px 12px}.wy-dropdown.wy-dropdown-bubble .wy-dropdown-menu a:hover{background:#2980b9;color:#fff}.wy-dropdown.wy-dropdown-left .wy-dropdown-menu{right:0;left:auto;text-align:right}.wy-dropdown-arrow:before{content:" ";border-bottom:5px solid #f5f5f5;border-left:5px solid transparent;border-right:5px solid transparent;position:absolute;display:block;top:-4px;left:50%;margin-left:-3px}.wy-dropdown-arrow.wy-dropdown-arrow-left:before{left:11px}.wy-form-stacked select{display:block}.wy-form-aligned .wy-help-inline,.wy-form-aligned input,.wy-form-aligned label,.wy-form-aligned select,.wy-form-aligned textarea{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-form-aligned .wy-control-group>label{display:inline-block;vertical-align:middle;width:10em;margin:6px 12px 0 0;float:left}.wy-form-aligned .wy-control{float:left}.wy-form-aligned .wy-control label{display:block}.wy-form-aligned .wy-control select{margin-top:6px}fieldset{margin:0}fieldset,legend{border:0;padding:0}legend{width:100%;white-space:normal;margin-bottom:24px;font-size:150%;*margin-left:-7px}label,legend{display:block}label{margin:0 0 
.3125em;color:#333;font-size:90%}input,select,textarea{font-size:100%;margin:0;vertical-align:baseline;*vertical-align:middle}.wy-control-group{margin-bottom:24px;max-width:1200px;margin-left:auto;margin-right:auto;*zoom:1}.wy-control-group:after,.wy-control-group:before{display:table;content:""}.wy-control-group:after{clear:both}.wy-control-group.wy-control-group-required>label:after{content:" *";color:#e74c3c}.wy-control-group .wy-form-full,.wy-control-group .wy-form-halves,.wy-control-group .wy-form-thirds{padding-bottom:12px}.wy-control-group .wy-form-full input[type=color],.wy-control-group .wy-form-full input[type=date],.wy-control-group .wy-form-full input[type=datetime-local],.wy-control-group .wy-form-full input[type=datetime],.wy-control-group .wy-form-full input[type=email],.wy-control-group .wy-form-full input[type=month],.wy-control-group .wy-form-full input[type=number],.wy-control-group .wy-form-full input[type=password],.wy-control-group .wy-form-full input[type=search],.wy-control-group .wy-form-full input[type=tel],.wy-control-group .wy-form-full input[type=text],.wy-control-group .wy-form-full input[type=time],.wy-control-group .wy-form-full input[type=url],.wy-control-group .wy-form-full input[type=week],.wy-control-group .wy-form-full select,.wy-control-group .wy-form-halves input[type=color],.wy-control-group .wy-form-halves input[type=date],.wy-control-group .wy-form-halves input[type=datetime-local],.wy-control-group .wy-form-halves input[type=datetime],.wy-control-group .wy-form-halves input[type=email],.wy-control-group .wy-form-halves input[type=month],.wy-control-group .wy-form-halves input[type=number],.wy-control-group .wy-form-halves input[type=password],.wy-control-group .wy-form-halves input[type=search],.wy-control-group .wy-form-halves input[type=tel],.wy-control-group .wy-form-halves input[type=text],.wy-control-group .wy-form-halves input[type=time],.wy-control-group .wy-form-halves input[type=url],.wy-control-group 
.wy-form-halves input[type=week],.wy-control-group .wy-form-halves select,.wy-control-group .wy-form-thirds input[type=color],.wy-control-group .wy-form-thirds input[type=date],.wy-control-group .wy-form-thirds input[type=datetime-local],.wy-control-group .wy-form-thirds input[type=datetime],.wy-control-group .wy-form-thirds input[type=email],.wy-control-group .wy-form-thirds input[type=month],.wy-control-group .wy-form-thirds input[type=number],.wy-control-group .wy-form-thirds input[type=password],.wy-control-group .wy-form-thirds input[type=search],.wy-control-group .wy-form-thirds input[type=tel],.wy-control-group .wy-form-thirds input[type=text],.wy-control-group .wy-form-thirds input[type=time],.wy-control-group .wy-form-thirds input[type=url],.wy-control-group .wy-form-thirds input[type=week],.wy-control-group .wy-form-thirds select{width:100%}.wy-control-group .wy-form-full{float:left;display:block;width:100%;margin-right:0}.wy-control-group .wy-form-full:last-child{margin-right:0}.wy-control-group .wy-form-halves{float:left;display:block;margin-right:2.35765%;width:48.82117%}.wy-control-group .wy-form-halves:last-child,.wy-control-group .wy-form-halves:nth-of-type(2n){margin-right:0}.wy-control-group .wy-form-halves:nth-of-type(odd){clear:left}.wy-control-group .wy-form-thirds{float:left;display:block;margin-right:2.35765%;width:31.76157%}.wy-control-group .wy-form-thirds:last-child,.wy-control-group .wy-form-thirds:nth-of-type(3n){margin-right:0}.wy-control-group .wy-form-thirds:nth-of-type(3n+1){clear:left}.wy-control-group.wy-control-group-no-input .wy-control,.wy-control-no-input{margin:6px 0 0;font-size:90%}.wy-control-no-input{display:inline-block}.wy-control-group.fluid-input input[type=color],.wy-control-group.fluid-input input[type=date],.wy-control-group.fluid-input input[type=datetime-local],.wy-control-group.fluid-input input[type=datetime],.wy-control-group.fluid-input input[type=email],.wy-control-group.fluid-input 
input[type=month],.wy-control-group.fluid-input input[type=number],.wy-control-group.fluid-input input[type=password],.wy-control-group.fluid-input input[type=search],.wy-control-group.fluid-input input[type=tel],.wy-control-group.fluid-input input[type=text],.wy-control-group.fluid-input input[type=time],.wy-control-group.fluid-input input[type=url],.wy-control-group.fluid-input input[type=week]{width:100%}.wy-form-message-inline{padding-left:.3em;color:#666;font-size:90%}.wy-form-message{display:block;color:#999;font-size:70%;margin-top:.3125em;font-style:italic}.wy-form-message p{font-size:inherit;font-style:italic;margin-bottom:6px}.wy-form-message p:last-child{margin-bottom:0}input{line-height:normal}input[type=button],input[type=reset],input[type=submit]{-webkit-appearance:button;cursor:pointer;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;*overflow:visible}input[type=color],input[type=date],input[type=datetime-local],input[type=datetime],input[type=email],input[type=month],input[type=number],input[type=password],input[type=search],input[type=tel],input[type=text],input[type=time],input[type=url],input[type=week]{-webkit-appearance:none;padding:6px;display:inline-block;border:1px solid #ccc;font-size:80%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;box-shadow:inset 0 1px 3px #ddd;border-radius:0;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}input[type=datetime-local]{padding:.34375em 
.625em}input[disabled]{cursor:default}input[type=checkbox],input[type=radio]{padding:0;margin-right:.3125em;*height:13px;*width:13px}input[type=checkbox],input[type=radio],input[type=search]{-webkit-box-sizing:border-box;-moz-box-sizing:border-box;box-sizing:border-box}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration{-webkit-appearance:none}input[type=color]:focus,input[type=date]:focus,input[type=datetime-local]:focus,input[type=datetime]:focus,input[type=email]:focus,input[type=month]:focus,input[type=number]:focus,input[type=password]:focus,input[type=search]:focus,input[type=tel]:focus,input[type=text]:focus,input[type=time]:focus,input[type=url]:focus,input[type=week]:focus{outline:0;outline:thin dotted\9;border-color:#333}input.no-focus:focus{border-color:#ccc!important}input[type=checkbox]:focus,input[type=file]:focus,input[type=radio]:focus{outline:thin dotted #333;outline:1px auto #129fea}input[type=color][disabled],input[type=date][disabled],input[type=datetime-local][disabled],input[type=datetime][disabled],input[type=email][disabled],input[type=month][disabled],input[type=number][disabled],input[type=password][disabled],input[type=search][disabled],input[type=tel][disabled],input[type=text][disabled],input[type=time][disabled],input[type=url][disabled],input[type=week][disabled]{cursor:not-allowed;background-color:#fafafa}input:focus:invalid,select:focus:invalid,textarea:focus:invalid{color:#e74c3c;border:1px solid #e74c3c}input:focus:invalid:focus,select:focus:invalid:focus,textarea:focus:invalid:focus{border-color:#e74c3c}input[type=checkbox]:focus:invalid:focus,input[type=file]:focus:invalid:focus,input[type=radio]:focus:invalid:focus{outline-color:#e74c3c}input.wy-input-large{padding:12px;font-size:100%}textarea{overflow:auto;vertical-align:top;width:100%;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif}select,textarea{padding:.5em .625em;display:inline-block;border:1px solid 
#ccc;font-size:80%;box-shadow:inset 0 1px 3px #ddd;-webkit-transition:border .3s linear;-moz-transition:border .3s linear;transition:border .3s linear}select{border:1px solid #ccc;background-color:#fff}select[multiple]{height:auto}select:focus,textarea:focus{outline:0}input[readonly],select[disabled],select[readonly],textarea[disabled],textarea[readonly]{cursor:not-allowed;background-color:#fafafa}input[type=checkbox][disabled],input[type=radio][disabled]{cursor:not-allowed}.wy-checkbox,.wy-radio{margin:6px 0;color:#404040;display:block}.wy-checkbox input,.wy-radio input{vertical-align:baseline}.wy-form-message-inline{display:inline-block;*display:inline;*zoom:1;vertical-align:middle}.wy-input-prefix,.wy-input-suffix{white-space:nowrap;padding:6px}.wy-input-prefix .wy-input-context,.wy-input-suffix .wy-input-context{line-height:27px;padding:0 8px;display:inline-block;font-size:80%;background-color:#f3f6f6;border:1px solid #ccc;color:#999}.wy-input-suffix .wy-input-context{border-left:0}.wy-input-prefix .wy-input-context{border-right:0}.wy-switch{position:relative;display:block;height:24px;margin-top:12px;cursor:pointer}.wy-switch:before{left:0;top:0;width:36px;height:12px;background:#ccc}.wy-switch:after,.wy-switch:before{position:absolute;content:"";display:block;border-radius:4px;-webkit-transition:all .2s ease-in-out;-moz-transition:all .2s ease-in-out;transition:all .2s ease-in-out}.wy-switch:after{width:18px;height:18px;background:#999;left:-3px;top:-3px}.wy-switch span{position:absolute;left:48px;display:block;font-size:12px;color:#ccc;line-height:1}.wy-switch.active:before{background:#1e8449}.wy-switch.active:after{left:24px;background:#27ae60}.wy-switch.disabled{cursor:not-allowed;opacity:.8}.wy-control-group.wy-control-group-error .wy-form-message,.wy-control-group.wy-control-group-error>label{color:#e74c3c}.wy-control-group.wy-control-group-error input[type=color],.wy-control-group.wy-control-group-error 
input[type=date],.wy-control-group.wy-control-group-error input[type=datetime-local],.wy-control-group.wy-control-group-error input[type=datetime],.wy-control-group.wy-control-group-error input[type=email],.wy-control-group.wy-control-group-error input[type=month],.wy-control-group.wy-control-group-error input[type=number],.wy-control-group.wy-control-group-error input[type=password],.wy-control-group.wy-control-group-error input[type=search],.wy-control-group.wy-control-group-error input[type=tel],.wy-control-group.wy-control-group-error input[type=text],.wy-control-group.wy-control-group-error input[type=time],.wy-control-group.wy-control-group-error input[type=url],.wy-control-group.wy-control-group-error input[type=week],.wy-control-group.wy-control-group-error textarea{border:1px solid #e74c3c}.wy-inline-validate{white-space:nowrap}.wy-inline-validate .wy-input-context{padding:.5em .625em;display:inline-block;font-size:80%}.wy-inline-validate.wy-inline-validate-success .wy-input-context{color:#27ae60}.wy-inline-validate.wy-inline-validate-danger .wy-input-context{color:#e74c3c}.wy-inline-validate.wy-inline-validate-warning .wy-input-context{color:#e67e22}.wy-inline-validate.wy-inline-validate-info .wy-input-context{color:#2980b9}.rotate-90{-webkit-transform:rotate(90deg);-moz-transform:rotate(90deg);-ms-transform:rotate(90deg);-o-transform:rotate(90deg);transform:rotate(90deg)}.rotate-180{-webkit-transform:rotate(180deg);-moz-transform:rotate(180deg);-ms-transform:rotate(180deg);-o-transform:rotate(180deg);transform:rotate(180deg)}.rotate-270{-webkit-transform:rotate(270deg);-moz-transform:rotate(270deg);-ms-transform:rotate(270deg);-o-transform:rotate(270deg);transform:rotate(270deg)}.mirror{-webkit-transform:scaleX(-1);-moz-transform:scaleX(-1);-ms-transform:scaleX(-1);-o-transform:scaleX(-1);transform:scaleX(-1)}.mirror.rotate-90{-webkit-transform:scaleX(-1) rotate(90deg);-moz-transform:scaleX(-1) rotate(90deg);-ms-transform:scaleX(-1) 
rotate(90deg);-o-transform:scaleX(-1) rotate(90deg);transform:scaleX(-1) rotate(90deg)}.mirror.rotate-180{-webkit-transform:scaleX(-1) rotate(180deg);-moz-transform:scaleX(-1) rotate(180deg);-ms-transform:scaleX(-1) rotate(180deg);-o-transform:scaleX(-1) rotate(180deg);transform:scaleX(-1) rotate(180deg)}.mirror.rotate-270{-webkit-transform:scaleX(-1) rotate(270deg);-moz-transform:scaleX(-1) rotate(270deg);-ms-transform:scaleX(-1) rotate(270deg);-o-transform:scaleX(-1) rotate(270deg);transform:scaleX(-1) rotate(270deg)}@media only screen and (max-width:480px){.wy-form button[type=submit]{margin:.7em 0 0}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=text],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week],.wy-form label{margin-bottom:.3em;display:block}.wy-form input[type=color],.wy-form input[type=date],.wy-form input[type=datetime-local],.wy-form input[type=datetime],.wy-form input[type=email],.wy-form input[type=month],.wy-form input[type=number],.wy-form input[type=password],.wy-form input[type=search],.wy-form input[type=tel],.wy-form input[type=time],.wy-form input[type=url],.wy-form input[type=week]{margin-bottom:0}.wy-form-aligned .wy-control-group label{margin-bottom:.3em;text-align:left;display:block;width:100%}.wy-form-aligned .wy-control{margin:1.5em 0 0}.wy-form-message,.wy-form-message-inline,.wy-form .wy-help-inline{display:block;font-size:80%;padding:6px 0}}@media screen and (max-width:768px){.tablet-hide{display:none}}@media screen and (max-width:480px){.mobile-hide{display:none}}.float-left{float:left}.float-right{float:right}.full-width{width:100%}.rst-content table.docutils,.rst-content 
table.field-list,.wy-table{border-collapse:collapse;border-spacing:0;empty-cells:show;margin-bottom:24px}.rst-content table.docutils caption,.rst-content table.field-list caption,.wy-table caption{color:#000;font:italic 85%/1 arial,sans-serif;padding:1em 0;text-align:center}.rst-content table.docutils td,.rst-content table.docutils th,.rst-content table.field-list td,.rst-content table.field-list th,.wy-table td,.wy-table th{font-size:90%;margin:0;overflow:visible;padding:8px 16px}.rst-content table.docutils td:first-child,.rst-content table.docutils th:first-child,.rst-content table.field-list td:first-child,.rst-content table.field-list th:first-child,.wy-table td:first-child,.wy-table th:first-child{border-left-width:0}.rst-content table.docutils thead,.rst-content table.field-list thead,.wy-table thead{color:#000;text-align:left;vertical-align:bottom;white-space:nowrap}.rst-content table.docutils thead th,.rst-content table.field-list thead th,.wy-table thead th{font-weight:700;border-bottom:2px solid #e1e4e5}.rst-content table.docutils td,.rst-content table.field-list td,.wy-table td{background-color:transparent;vertical-align:middle}.rst-content table.docutils td p,.rst-content table.field-list td p,.wy-table td p{line-height:18px}.rst-content table.docutils td p:last-child,.rst-content table.field-list td p:last-child,.wy-table td p:last-child{margin-bottom:0}.rst-content table.docutils .wy-table-cell-min,.rst-content table.field-list .wy-table-cell-min,.wy-table .wy-table-cell-min{width:1%;padding-right:0}.rst-content table.docutils .wy-table-cell-min input[type=checkbox],.rst-content table.field-list .wy-table-cell-min input[type=checkbox],.wy-table .wy-table-cell-min input[type=checkbox]{margin:0}.wy-table-secondary{color:grey;font-size:90%}.wy-table-tertiary{color:grey;font-size:80%}.rst-content table.docutils:not(.field-list) tr:nth-child(2n-1) td,.wy-table-backed,.wy-table-odd td,.wy-table-striped tr:nth-child(2n-1) 
td{background-color:#f3f6f6}.rst-content table.docutils,.wy-table-bordered-all{border:1px solid #e1e4e5}.rst-content table.docutils td,.wy-table-bordered-all td{border-bottom:1px solid #e1e4e5;border-left:1px solid #e1e4e5}.rst-content table.docutils tbody>tr:last-child td,.wy-table-bordered-all tbody>tr:last-child td{border-bottom-width:0}.wy-table-bordered{border:1px solid #e1e4e5}.wy-table-bordered-rows td{border-bottom:1px solid #e1e4e5}.wy-table-bordered-rows tbody>tr:last-child td{border-bottom-width:0}.wy-table-horizontal td,.wy-table-horizontal th{border-width:0 0 1px;border-bottom:1px solid #e1e4e5}.wy-table-horizontal tbody>tr:last-child td{border-bottom-width:0}.wy-table-responsive{margin-bottom:24px;max-width:100%;overflow:auto}.wy-table-responsive table{margin-bottom:0!important}.wy-table-responsive table td,.wy-table-responsive table th{white-space:nowrap}a{color:#2980b9;text-decoration:none;cursor:pointer}a:hover{color:#3091d1}a:visited{color:#9b59b6}html{height:100%}body,html{overflow-x:hidden}body{font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;font-weight:400;color:#404040;min-height:100%;background:#edf0f2}.wy-text-left{text-align:left}.wy-text-center{text-align:center}.wy-text-right{text-align:right}.wy-text-large{font-size:120%}.wy-text-normal{font-size:100%}.wy-text-small,small{font-size:80%}.wy-text-strike{text-decoration:line-through}.wy-text-warning{color:#e67e22!important}a.wy-text-warning:hover{color:#eb9950!important}.wy-text-info{color:#2980b9!important}a.wy-text-info:hover{color:#409ad5!important}.wy-text-success{color:#27ae60!important}a.wy-text-success:hover{color:#36d278!important}.wy-text-danger{color:#e74c3c!important}a.wy-text-danger:hover{color:#ed7669!important}.wy-text-neutral{color:#404040!important}a.wy-text-neutral:hover{color:#595959!important}.rst-content .toctree-wrapper>p.caption,h1,h2,h3,h4,h5,h6,legend{margin-top:0;font-weight:700;font-family:Roboto 
Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif}p{line-height:24px;font-size:16px;margin:0 0 24px}h1{font-size:175%}.rst-content .toctree-wrapper>p.caption,h2{font-size:150%}h3{font-size:125%}h4{font-size:115%}h5{font-size:110%}h6{font-size:100%}hr{display:block;height:1px;border:0;border-top:1px solid #e1e4e5;margin:24px 0;padding:0}.rst-content code,.rst-content tt,code{white-space:nowrap;max-width:100%;background:#fff;border:1px solid #e1e4e5;font-size:75%;padding:0 5px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#e74c3c;overflow-x:auto}.rst-content tt.code-large,code.code-large{font-size:90%}.rst-content .section ul,.rst-content .toctree-wrapper ul,.rst-content section ul,.wy-plain-list-disc,article ul{list-style:disc;line-height:24px;margin-bottom:24px}.rst-content .section ul li,.rst-content .toctree-wrapper ul li,.rst-content section ul li,.wy-plain-list-disc li,article ul li{list-style:disc;margin-left:24px}.rst-content .section ul li p:last-child,.rst-content .section ul li ul,.rst-content .toctree-wrapper ul li p:last-child,.rst-content .toctree-wrapper ul li ul,.rst-content section ul li p:last-child,.rst-content section ul li ul,.wy-plain-list-disc li p:last-child,.wy-plain-list-disc li ul,article ul li p:last-child,article ul li ul{margin-bottom:0}.rst-content .section ul li li,.rst-content .toctree-wrapper ul li li,.rst-content section ul li li,.wy-plain-list-disc li li,article ul li li{list-style:circle}.rst-content .section ul li li li,.rst-content .toctree-wrapper ul li li li,.rst-content section ul li li li,.wy-plain-list-disc li li li,article ul li li li{list-style:square}.rst-content .section ul li ol li,.rst-content .toctree-wrapper ul li ol li,.rst-content section ul li ol li,.wy-plain-list-disc li ol li,article ul li ol li{list-style:decimal}.rst-content .section ol,.rst-content .section ol.arabic,.rst-content .toctree-wrapper ol,.rst-content .toctree-wrapper ol.arabic,.rst-content 
section ol,.rst-content section ol.arabic,.wy-plain-list-decimal,article ol{list-style:decimal;line-height:24px;margin-bottom:24px}.rst-content .section ol.arabic li,.rst-content .section ol li,.rst-content .toctree-wrapper ol.arabic li,.rst-content .toctree-wrapper ol li,.rst-content section ol.arabic li,.rst-content section ol li,.wy-plain-list-decimal li,article ol li{list-style:decimal;margin-left:24px}.rst-content .section ol.arabic li ul,.rst-content .section ol li p:last-child,.rst-content .section ol li ul,.rst-content .toctree-wrapper ol.arabic li ul,.rst-content .toctree-wrapper ol li p:last-child,.rst-content .toctree-wrapper ol li ul,.rst-content section ol.arabic li ul,.rst-content section ol li p:last-child,.rst-content section ol li ul,.wy-plain-list-decimal li p:last-child,.wy-plain-list-decimal li ul,article ol li p:last-child,article ol li ul{margin-bottom:0}.rst-content .section ol.arabic li ul li,.rst-content .section ol li ul li,.rst-content .toctree-wrapper ol.arabic li ul li,.rst-content .toctree-wrapper ol li ul li,.rst-content section ol.arabic li ul li,.rst-content section ol li ul li,.wy-plain-list-decimal li ul li,article ol li ul li{list-style:disc}.wy-breadcrumbs{*zoom:1}.wy-breadcrumbs:after,.wy-breadcrumbs:before{display:table;content:""}.wy-breadcrumbs:after{clear:both}.wy-breadcrumbs>li{display:inline-block;padding-top:5px}.wy-breadcrumbs>li.wy-breadcrumbs-aside{float:right}.rst-content .wy-breadcrumbs>li code,.rst-content .wy-breadcrumbs>li tt,.wy-breadcrumbs>li .rst-content tt,.wy-breadcrumbs>li code{all:inherit;color:inherit}.breadcrumb-item:before{content:"/";color:#bbb;font-size:13px;padding:0 6px 0 3px}.wy-breadcrumbs-extra{margin-bottom:0;color:#b3b3b3;font-size:80%;display:inline-block}@media screen and (max-width:480px){.wy-breadcrumbs-extra,.wy-breadcrumbs li.wy-breadcrumbs-aside{display:none}}@media print{.wy-breadcrumbs 
li.wy-breadcrumbs-aside{display:none}}html{font-size:16px}.wy-affix{position:fixed;top:1.618em}.wy-menu a:hover{text-decoration:none}.wy-menu-horiz{*zoom:1}.wy-menu-horiz:after,.wy-menu-horiz:before{display:table;content:""}.wy-menu-horiz:after{clear:both}.wy-menu-horiz li,.wy-menu-horiz ul{display:inline-block}.wy-menu-horiz li:hover{background:hsla(0,0%,100%,.1)}.wy-menu-horiz li.divide-left{border-left:1px solid #404040}.wy-menu-horiz li.divide-right{border-right:1px solid #404040}.wy-menu-horiz a{height:32px;display:inline-block;line-height:32px;padding:0 16px}.wy-menu-vertical{width:300px}.wy-menu-vertical header,.wy-menu-vertical p.caption{color:#55a5d9;height:32px;line-height:32px;padding:0 1.618em;margin:12px 0 0;display:block;font-weight:700;text-transform:uppercase;font-size:85%;white-space:nowrap}.wy-menu-vertical ul{margin-bottom:0}.wy-menu-vertical li.divide-top{border-top:1px solid #404040}.wy-menu-vertical li.divide-bottom{border-bottom:1px solid #404040}.wy-menu-vertical li.current{background:#e3e3e3}.wy-menu-vertical li.current a{color:grey;border-right:1px solid #c9c9c9;padding:.4045em 2.427em}.wy-menu-vertical li.current a:hover{background:#d6d6d6}.rst-content .wy-menu-vertical li tt,.wy-menu-vertical li .rst-content tt,.wy-menu-vertical li code{border:none;background:inherit;color:inherit;padding-left:0;padding-right:0}.wy-menu-vertical li button.toctree-expand{display:block;float:left;margin-left:-1.2em;line-height:18px;color:#4d4d4d;border:none;background:none;padding:0}.wy-menu-vertical li.current>a,.wy-menu-vertical li.on a{color:#404040;font-weight:700;position:relative;background:#fcfcfc;border:none;padding:.4045em 1.618em}.wy-menu-vertical li.current>a:hover,.wy-menu-vertical li.on a:hover{background:#fcfcfc}.wy-menu-vertical li.current>a:hover button.toctree-expand,.wy-menu-vertical li.on a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.current>a button.toctree-expand,.wy-menu-vertical li.on a 
button.toctree-expand{display:block;line-height:18px;color:#333}.wy-menu-vertical li.toctree-l1.current>a{border-bottom:1px solid #c9c9c9;border-top:1px solid #c9c9c9}.wy-menu-vertical .toctree-l1.current .toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .toctree-l11>ul{display:none}.wy-menu-vertical .toctree-l1.current .current.toctree-l2>ul,.wy-menu-vertical .toctree-l2.current .current.toctree-l3>ul,.wy-menu-vertical .toctree-l3.current .current.toctree-l4>ul,.wy-menu-vertical .toctree-l4.current .current.toctree-l5>ul,.wy-menu-vertical .toctree-l5.current .current.toctree-l6>ul,.wy-menu-vertical .toctree-l6.current .current.toctree-l7>ul,.wy-menu-vertical .toctree-l7.current .current.toctree-l8>ul,.wy-menu-vertical .toctree-l8.current .current.toctree-l9>ul,.wy-menu-vertical .toctree-l9.current .current.toctree-l10>ul,.wy-menu-vertical .toctree-l10.current .current.toctree-l11>ul{display:block}.wy-menu-vertical li.toctree-l3,.wy-menu-vertical li.toctree-l4{font-size:.9em}.wy-menu-vertical li.toctree-l2 a,.wy-menu-vertical li.toctree-l3 a,.wy-menu-vertical li.toctree-l4 a,.wy-menu-vertical li.toctree-l5 a,.wy-menu-vertical li.toctree-l6 a,.wy-menu-vertical li.toctree-l7 a,.wy-menu-vertical li.toctree-l8 a,.wy-menu-vertical li.toctree-l9 a,.wy-menu-vertical li.toctree-l10 a{color:#404040}.wy-menu-vertical li.toctree-l2 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l3 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l4 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l5 a:hover 
button.toctree-expand,.wy-menu-vertical li.toctree-l6 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l7 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l8 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l9 a:hover button.toctree-expand,.wy-menu-vertical li.toctree-l10 a:hover button.toctree-expand{color:grey}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a,.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a,.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a,.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a,.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a,.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a,.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a,.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{display:block}.wy-menu-vertical li.toctree-l2.current>a{padding:.4045em 2.427em}.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{padding:.4045em 1.618em .4045em 4.045em}.wy-menu-vertical li.toctree-l3.current>a{padding:.4045em 4.045em}.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{padding:.4045em 1.618em .4045em 5.663em}.wy-menu-vertical li.toctree-l4.current>a{padding:.4045em 5.663em}.wy-menu-vertical li.toctree-l4.current li.toctree-l5>a{padding:.4045em 1.618em .4045em 7.281em}.wy-menu-vertical li.toctree-l5.current>a{padding:.4045em 7.281em}.wy-menu-vertical li.toctree-l5.current li.toctree-l6>a{padding:.4045em 1.618em .4045em 8.899em}.wy-menu-vertical li.toctree-l6.current>a{padding:.4045em 8.899em}.wy-menu-vertical li.toctree-l6.current li.toctree-l7>a{padding:.4045em 1.618em .4045em 10.517em}.wy-menu-vertical li.toctree-l7.current>a{padding:.4045em 10.517em}.wy-menu-vertical li.toctree-l7.current li.toctree-l8>a{padding:.4045em 1.618em .4045em 12.135em}.wy-menu-vertical li.toctree-l8.current>a{padding:.4045em 12.135em}.wy-menu-vertical li.toctree-l8.current li.toctree-l9>a{padding:.4045em 1.618em .4045em 
13.753em}.wy-menu-vertical li.toctree-l9.current>a{padding:.4045em 13.753em}.wy-menu-vertical li.toctree-l9.current li.toctree-l10>a{padding:.4045em 1.618em .4045em 15.371em}.wy-menu-vertical li.toctree-l10.current>a{padding:.4045em 15.371em}.wy-menu-vertical li.toctree-l10.current li.toctree-l11>a{padding:.4045em 1.618em .4045em 16.989em}.wy-menu-vertical li.toctree-l2.current>a,.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a{background:#c9c9c9}.wy-menu-vertical li.toctree-l2 button.toctree-expand{color:#a3a3a3}.wy-menu-vertical li.toctree-l3.current>a,.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{background:#bdbdbd}.wy-menu-vertical li.toctree-l3 button.toctree-expand{color:#969696}.wy-menu-vertical li.current ul{display:block}.wy-menu-vertical li ul{margin-bottom:0;display:none}.wy-menu-vertical li ul li a{margin-bottom:0;color:#d9d9d9;font-weight:400}.wy-menu-vertical a{line-height:18px;padding:.4045em 1.618em;display:block;position:relative;font-size:90%;color:#d9d9d9}.wy-menu-vertical a:hover{background-color:#4e4a4a;cursor:pointer}.wy-menu-vertical a:hover button.toctree-expand{color:#d9d9d9}.wy-menu-vertical a:active{background-color:#2980b9;cursor:pointer;color:#fff}.wy-menu-vertical a:active button.toctree-expand{color:#fff}.wy-side-nav-search{display:block;width:300px;padding:.809em;margin-bottom:.809em;z-index:200;background-color:#2980b9;text-align:center;color:#fcfcfc}.wy-side-nav-search input[type=text]{width:100%;border-radius:50px;padding:6px 12px;border-color:#2472a4}.wy-side-nav-search img{display:block;margin:auto auto .809em;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-side-nav-search .wy-dropdown>a,.wy-side-nav-search>a{color:#fcfcfc;font-size:100%;font-weight:700;display:inline-block;padding:4px 6px;margin-bottom:.809em;max-width:100%}.wy-side-nav-search .wy-dropdown>a:hover,.wy-side-nav-search>a:hover{background:hsla(0,0%,100%,.1)}.wy-side-nav-search .wy-dropdown>a 
img.logo,.wy-side-nav-search>a img.logo{display:block;margin:0 auto;height:auto;width:auto;border-radius:0;max-width:100%;background:transparent}.wy-side-nav-search .wy-dropdown>a.icon img.logo,.wy-side-nav-search>a.icon img.logo{margin-top:.85em}.wy-side-nav-search>div.version{margin-top:-.4045em;margin-bottom:.809em;font-weight:400;color:hsla(0,0%,100%,.3)}.wy-nav .wy-menu-vertical header{color:#2980b9}.wy-nav .wy-menu-vertical a{color:#b3b3b3}.wy-nav .wy-menu-vertical a:hover{background-color:#2980b9;color:#fff}[data-menu-wrap]{-webkit-transition:all .2s ease-in;-moz-transition:all .2s ease-in;transition:all .2s ease-in;position:absolute;opacity:1;width:100%;opacity:0}[data-menu-wrap].move-center{left:0;right:auto;opacity:1}[data-menu-wrap].move-left{right:auto;left:-100%;opacity:0}[data-menu-wrap].move-right{right:-100%;left:auto;opacity:0}.wy-body-for-nav{background:#fcfcfc}.wy-grid-for-nav{position:absolute;width:100%;height:100%}.wy-nav-side{position:fixed;top:0;bottom:0;left:0;padding-bottom:2em;width:300px;overflow-x:hidden;overflow-y:hidden;min-height:100%;color:#9b9b9b;background:#343131;z-index:200}.wy-side-scroll{width:320px;position:relative;overflow-x:hidden;overflow-y:scroll;height:100%}.wy-nav-top{display:none;background:#2980b9;color:#fff;padding:.4045em .809em;position:relative;line-height:50px;text-align:center;font-size:100%;*zoom:1}.wy-nav-top:after,.wy-nav-top:before{display:table;content:""}.wy-nav-top:after{clear:both}.wy-nav-top a{color:#fff;font-weight:700}.wy-nav-top img{margin-right:12px;height:45px;width:45px;background-color:#2980b9;padding:5px;border-radius:100%}.wy-nav-top i{font-size:30px;float:left;cursor:pointer;padding-top:inherit}.wy-nav-content-wrap{margin-left:300px;background:#fcfcfc;min-height:100%}.wy-nav-content{padding:1.618em 
3.236em;height:100%;max-width:800px;margin:auto}.wy-body-mask{position:fixed;width:100%;height:100%;background:rgba(0,0,0,.2);display:none;z-index:499}.wy-body-mask.on{display:block}footer{color:grey}footer p{margin-bottom:12px}.rst-content footer span.commit tt,footer span.commit .rst-content tt,footer span.commit code{padding:0;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:1em;background:none;border:none;color:grey}.rst-footer-buttons{*zoom:1}.rst-footer-buttons:after,.rst-footer-buttons:before{width:100%;display:table;content:""}.rst-footer-buttons:after{clear:both}.rst-breadcrumbs-buttons{margin-top:12px;*zoom:1}.rst-breadcrumbs-buttons:after,.rst-breadcrumbs-buttons:before{display:table;content:""}.rst-breadcrumbs-buttons:after{clear:both}#search-results .search li{margin-bottom:24px;border-bottom:1px solid #e1e4e5;padding-bottom:24px}#search-results .search li:first-child{border-top:1px solid #e1e4e5;padding-top:24px}#search-results .search li a{font-size:120%;margin-bottom:12px;display:inline-block}#search-results .context{color:grey;font-size:90%}.genindextable li>ul{margin-left:24px}@media screen and (max-width:768px){.wy-body-for-nav{background:#fcfcfc}.wy-nav-top{display:block}.wy-nav-side{left:-300px}.wy-nav-side.shift{width:85%;left:0}.wy-menu.wy-menu-vertical,.wy-side-nav-search,.wy-side-scroll{width:auto}.wy-nav-content-wrap{margin-left:0}.wy-nav-content-wrap .wy-nav-content{padding:1.618em}.wy-nav-content-wrap.shift{position:fixed;min-width:100%;left:85%;top:0;height:100%;overflow:hidden}}@media screen and (min-width:1100px){.wy-nav-content-wrap{background:rgba(0,0,0,.05)}.wy-nav-content{margin:0;background:#fcfcfc}}@media print{.rst-versions,.wy-nav-side,footer{display:none}.wy-nav-content-wrap{margin-left:0}}.rst-versions{position:fixed;bottom:0;left:0;width:300px;color:#fcfcfc;background:#1f1d1d;font-family:Lato,proxima-nova,Helvetica Neue,Arial,sans-serif;z-index:400}.rst-versions 
a{color:#2980b9;text-decoration:none}.rst-versions .rst-badge-small{display:none}.rst-versions .rst-current-version{padding:12px;background-color:#272525;display:block;text-align:right;font-size:90%;cursor:pointer;color:#27ae60;*zoom:1}.rst-versions .rst-current-version:after,.rst-versions .rst-current-version:before{display:table;content:""}.rst-versions .rst-current-version:after{clear:both}.rst-content .code-block-caption .rst-versions .rst-current-version .headerlink,.rst-content .eqno .rst-versions .rst-current-version .headerlink,.rst-content .rst-versions .rst-current-version .admonition-title,.rst-content code.download .rst-versions .rst-current-version span:first-child,.rst-content dl dt .rst-versions .rst-current-version .headerlink,.rst-content h1 .rst-versions .rst-current-version .headerlink,.rst-content h2 .rst-versions .rst-current-version .headerlink,.rst-content h3 .rst-versions .rst-current-version .headerlink,.rst-content h4 .rst-versions .rst-current-version .headerlink,.rst-content h5 .rst-versions .rst-current-version .headerlink,.rst-content h6 .rst-versions .rst-current-version .headerlink,.rst-content p .rst-versions .rst-current-version .headerlink,.rst-content table>caption .rst-versions .rst-current-version .headerlink,.rst-content tt.download .rst-versions .rst-current-version span:first-child,.rst-versions .rst-current-version .fa,.rst-versions .rst-current-version .icon,.rst-versions .rst-current-version .rst-content .admonition-title,.rst-versions .rst-current-version .rst-content .code-block-caption .headerlink,.rst-versions .rst-current-version .rst-content .eqno .headerlink,.rst-versions .rst-current-version .rst-content code.download span:first-child,.rst-versions .rst-current-version .rst-content dl dt .headerlink,.rst-versions .rst-current-version .rst-content h1 .headerlink,.rst-versions .rst-current-version .rst-content h2 .headerlink,.rst-versions .rst-current-version .rst-content h3 .headerlink,.rst-versions 
.rst-current-version .rst-content h4 .headerlink,.rst-versions .rst-current-version .rst-content h5 .headerlink,.rst-versions .rst-current-version .rst-content h6 .headerlink,.rst-versions .rst-current-version .rst-content p .headerlink,.rst-versions .rst-current-version .rst-content table>caption .headerlink,.rst-versions .rst-current-version .rst-content tt.download span:first-child,.rst-versions .rst-current-version .wy-menu-vertical li button.toctree-expand,.wy-menu-vertical li .rst-versions .rst-current-version button.toctree-expand{color:#fcfcfc}.rst-versions .rst-current-version .fa-book,.rst-versions .rst-current-version .icon-book{float:left}.rst-versions .rst-current-version.rst-out-of-date{background-color:#e74c3c;color:#fff}.rst-versions .rst-current-version.rst-active-old-version{background-color:#f1c40f;color:#000}.rst-versions.shift-up{height:auto;max-height:100%;overflow-y:scroll}.rst-versions.shift-up .rst-other-versions{display:block}.rst-versions .rst-other-versions{font-size:90%;padding:12px;color:grey;display:none}.rst-versions .rst-other-versions hr{display:block;height:1px;border:0;margin:20px 0;padding:0;border-top:1px solid #413d3d}.rst-versions .rst-other-versions dd{display:inline-block;margin:0}.rst-versions .rst-other-versions dd a{display:inline-block;padding:6px;color:#fcfcfc}.rst-versions.rst-badge{width:auto;bottom:20px;right:20px;left:auto;border:none;max-width:300px;max-height:90%}.rst-versions.rst-badge .fa-book,.rst-versions.rst-badge .icon-book{float:none;line-height:30px}.rst-versions.rst-badge.shift-up .rst-current-version{text-align:right}.rst-versions.rst-badge.shift-up .rst-current-version .fa-book,.rst-versions.rst-badge.shift-up .rst-current-version .icon-book{float:left}.rst-versions.rst-badge>.rst-current-version{width:auto;height:30px;line-height:30px;padding:0 6px;display:block;text-align:center}@media screen and (max-width:768px){.rst-versions{width:85%;display:none}.rst-versions.shift{display:block}}.rst-content 
.toctree-wrapper>p.caption,.rst-content h1,.rst-content h2,.rst-content h3,.rst-content h4,.rst-content h5,.rst-content h6{margin-bottom:24px}.rst-content img{max-width:100%;height:auto}.rst-content div.figure,.rst-content figure{margin-bottom:24px}.rst-content div.figure .caption-text,.rst-content figure .caption-text{font-style:italic}.rst-content div.figure p:last-child.caption,.rst-content figure p:last-child.caption{margin-bottom:0}.rst-content div.figure.align-center,.rst-content figure.align-center{text-align:center}.rst-content .section>a>img,.rst-content .section>img,.rst-content section>a>img,.rst-content section>img{margin-bottom:24px}.rst-content abbr[title]{text-decoration:none}.rst-content.style-external-links a.reference.external:after{font-family:FontAwesome;content:"\f08e";color:#b3b3b3;vertical-align:super;font-size:60%;margin:0 .2em}.rst-content blockquote{margin-left:24px;line-height:24px;margin-bottom:24px}.rst-content pre.literal-block{white-space:pre;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;display:block;overflow:auto}.rst-content div[class^=highlight],.rst-content pre.literal-block{border:1px solid #e1e4e5;overflow-x:auto;margin:1px 0 24px}.rst-content div[class^=highlight] div[class^=highlight],.rst-content pre.literal-block div[class^=highlight]{padding:0;border:none;margin:0}.rst-content div[class^=highlight] td.code{width:100%}.rst-content .linenodiv pre{border-right:1px solid #e6e9ea;margin:0;padding:12px;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;user-select:none;pointer-events:none}.rst-content div[class^=highlight] pre{white-space:pre;margin:0;padding:12px;display:block;overflow:auto}.rst-content div[class^=highlight] pre .hll{display:block;margin:0 -12px;padding:0 12px}.rst-content .linenodiv pre,.rst-content div[class^=highlight] pre,.rst-content 
pre.literal-block{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;font-size:12px;line-height:1.4}.rst-content div.highlight .gp,.rst-content div.highlight span.linenos{user-select:none;pointer-events:none}.rst-content div.highlight span.linenos{display:inline-block;padding-left:0;padding-right:12px;margin-right:12px;border-right:1px solid #e6e9ea}.rst-content .code-block-caption{font-style:italic;font-size:85%;line-height:1;padding:1em 0;text-align:center}@media print{.rst-content .codeblock,.rst-content div[class^=highlight],.rst-content div[class^=highlight] pre{white-space:pre-wrap}}.rst-content .admonition,.rst-content .admonition-todo,.rst-content .attention,.rst-content .caution,.rst-content .danger,.rst-content .error,.rst-content .hint,.rst-content .important,.rst-content .note,.rst-content .seealso,.rst-content .tip,.rst-content .warning{clear:both}.rst-content .admonition-todo .last,.rst-content .admonition-todo>:last-child,.rst-content .admonition .last,.rst-content .admonition>:last-child,.rst-content .attention .last,.rst-content .attention>:last-child,.rst-content .caution .last,.rst-content .caution>:last-child,.rst-content .danger .last,.rst-content .danger>:last-child,.rst-content .error .last,.rst-content .error>:last-child,.rst-content .hint .last,.rst-content .hint>:last-child,.rst-content .important .last,.rst-content .important>:last-child,.rst-content .note .last,.rst-content .note>:last-child,.rst-content .seealso .last,.rst-content .seealso>:last-child,.rst-content .tip .last,.rst-content .tip>:last-child,.rst-content .warning .last,.rst-content .warning>:last-child{margin-bottom:0}.rst-content .admonition-title:before{margin-right:4px}.rst-content .admonition table{border-color:rgba(0,0,0,.1)}.rst-content .admonition table td,.rst-content .admonition table th{background:transparent!important;border-color:rgba(0,0,0,.1)!important}.rst-content .section ol.loweralpha,.rst-content .section 
ol.loweralpha>li,.rst-content .toctree-wrapper ol.loweralpha,.rst-content .toctree-wrapper ol.loweralpha>li,.rst-content section ol.loweralpha,.rst-content section ol.loweralpha>li{list-style:lower-alpha}.rst-content .section ol.upperalpha,.rst-content .section ol.upperalpha>li,.rst-content .toctree-wrapper ol.upperalpha,.rst-content .toctree-wrapper ol.upperalpha>li,.rst-content section ol.upperalpha,.rst-content section ol.upperalpha>li{list-style:upper-alpha}.rst-content .section ol li>*,.rst-content .section ul li>*,.rst-content .toctree-wrapper ol li>*,.rst-content .toctree-wrapper ul li>*,.rst-content section ol li>*,.rst-content section ul li>*{margin-top:12px;margin-bottom:12px}.rst-content .section ol li>:first-child,.rst-content .section ul li>:first-child,.rst-content .toctree-wrapper ol li>:first-child,.rst-content .toctree-wrapper ul li>:first-child,.rst-content section ol li>:first-child,.rst-content section ul li>:first-child{margin-top:0}.rst-content .section ol li>p,.rst-content .section ol li>p:last-child,.rst-content .section ul li>p,.rst-content .section ul li>p:last-child,.rst-content .toctree-wrapper ol li>p,.rst-content .toctree-wrapper ol li>p:last-child,.rst-content .toctree-wrapper ul li>p,.rst-content .toctree-wrapper ul li>p:last-child,.rst-content section ol li>p,.rst-content section ol li>p:last-child,.rst-content section ul li>p,.rst-content section ul li>p:last-child{margin-bottom:12px}.rst-content .section ol li>p:only-child,.rst-content .section ol li>p:only-child:last-child,.rst-content .section ul li>p:only-child,.rst-content .section ul li>p:only-child:last-child,.rst-content .toctree-wrapper ol li>p:only-child,.rst-content .toctree-wrapper ol li>p:only-child:last-child,.rst-content .toctree-wrapper ul li>p:only-child,.rst-content .toctree-wrapper ul li>p:only-child:last-child,.rst-content section ol li>p:only-child,.rst-content section ol li>p:only-child:last-child,.rst-content section ul li>p:only-child,.rst-content section ul 
li>p:only-child:last-child{margin-bottom:0}.rst-content .section ol li>ol,.rst-content .section ol li>ul,.rst-content .section ul li>ol,.rst-content .section ul li>ul,.rst-content .toctree-wrapper ol li>ol,.rst-content .toctree-wrapper ol li>ul,.rst-content .toctree-wrapper ul li>ol,.rst-content .toctree-wrapper ul li>ul,.rst-content section ol li>ol,.rst-content section ol li>ul,.rst-content section ul li>ol,.rst-content section ul li>ul{margin-bottom:12px}.rst-content .section ol.simple li>*,.rst-content .section ol.simple li ol,.rst-content .section ol.simple li ul,.rst-content .section ul.simple li>*,.rst-content .section ul.simple li ol,.rst-content .section ul.simple li ul,.rst-content .toctree-wrapper ol.simple li>*,.rst-content .toctree-wrapper ol.simple li ol,.rst-content .toctree-wrapper ol.simple li ul,.rst-content .toctree-wrapper ul.simple li>*,.rst-content .toctree-wrapper ul.simple li ol,.rst-content .toctree-wrapper ul.simple li ul,.rst-content section ol.simple li>*,.rst-content section ol.simple li ol,.rst-content section ol.simple li ul,.rst-content section ul.simple li>*,.rst-content section ul.simple li ol,.rst-content section ul.simple li ul{margin-top:0;margin-bottom:0}.rst-content .line-block{margin-left:0;margin-bottom:24px;line-height:24px}.rst-content .line-block .line-block{margin-left:24px;margin-bottom:0}.rst-content .topic-title{font-weight:700;margin-bottom:12px}.rst-content .toc-backref{color:#404040}.rst-content .align-right{float:right;margin:0 0 24px 24px}.rst-content .align-left{float:left;margin:0 24px 24px 0}.rst-content .align-center{margin:auto}.rst-content .align-center:not(table){display:block}.rst-content .code-block-caption .headerlink,.rst-content .eqno .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink,.rst-content dl dt .headerlink,.rst-content h1 .headerlink,.rst-content h2 .headerlink,.rst-content h3 .headerlink,.rst-content h4 .headerlink,.rst-content h5 .headerlink,.rst-content h6 
.headerlink,.rst-content p.caption .headerlink,.rst-content p .headerlink,.rst-content table>caption .headerlink{opacity:0;font-size:14px;font-family:FontAwesome;margin-left:.5em}.rst-content .code-block-caption .headerlink:focus,.rst-content .code-block-caption:hover .headerlink,.rst-content .eqno .headerlink:focus,.rst-content .eqno:hover .headerlink,.rst-content .toctree-wrapper>p.caption .headerlink:focus,.rst-content .toctree-wrapper>p.caption:hover .headerlink,.rst-content dl dt .headerlink:focus,.rst-content dl dt:hover .headerlink,.rst-content h1 .headerlink:focus,.rst-content h1:hover .headerlink,.rst-content h2 .headerlink:focus,.rst-content h2:hover .headerlink,.rst-content h3 .headerlink:focus,.rst-content h3:hover .headerlink,.rst-content h4 .headerlink:focus,.rst-content h4:hover .headerlink,.rst-content h5 .headerlink:focus,.rst-content h5:hover .headerlink,.rst-content h6 .headerlink:focus,.rst-content h6:hover .headerlink,.rst-content p.caption .headerlink:focus,.rst-content p.caption:hover .headerlink,.rst-content p .headerlink:focus,.rst-content p:hover .headerlink,.rst-content table>caption .headerlink:focus,.rst-content table>caption:hover .headerlink{opacity:1}.rst-content p a{overflow-wrap:anywhere}.rst-content .wy-table td p,.rst-content .wy-table td ul,.rst-content .wy-table th p,.rst-content .wy-table th ul,.rst-content table.docutils td p,.rst-content table.docutils td ul,.rst-content table.docutils th p,.rst-content table.docutils th ul,.rst-content table.field-list td p,.rst-content table.field-list td ul,.rst-content table.field-list th p,.rst-content table.field-list th ul{font-size:inherit}.rst-content .btn:focus{outline:2px solid}.rst-content table>caption .headerlink:after{font-size:12px}.rst-content .centered{text-align:center}.rst-content .sidebar{float:right;width:40%;display:block;margin:0 0 24px 24px;padding:24px;background:#f3f6f6;border:1px solid #e1e4e5}.rst-content .sidebar dl,.rst-content .sidebar p,.rst-content .sidebar 
ul{font-size:90%}.rst-content .sidebar .last,.rst-content .sidebar>:last-child{margin-bottom:0}.rst-content .sidebar .sidebar-title{display:block;font-family:Roboto Slab,ff-tisa-web-pro,Georgia,Arial,sans-serif;font-weight:700;background:#e1e4e5;padding:6px 12px;margin:-24px -24px 24px;font-size:100%}.rst-content .highlighted{background:#f1c40f;box-shadow:0 0 0 2px #f1c40f;display:inline;font-weight:700}.rst-content .citation-reference,.rst-content .footnote-reference{vertical-align:baseline;position:relative;top:-.4em;line-height:0;font-size:90%}.rst-content .citation-reference>span.fn-bracket,.rst-content .footnote-reference>span.fn-bracket{display:none}.rst-content .hlist{width:100%}.rst-content dl dt span.classifier:before{content:" : "}.rst-content dl dt span.classifier-delimiter{display:none!important}html.writer-html4 .rst-content table.docutils.citation,html.writer-html4 .rst-content table.docutils.footnote{background:none;border:none}html.writer-html4 .rst-content table.docutils.citation td,html.writer-html4 .rst-content table.docutils.citation tr,html.writer-html4 .rst-content table.docutils.footnote td,html.writer-html4 .rst-content table.docutils.footnote tr{border:none;background-color:transparent!important;white-space:normal}html.writer-html4 .rst-content table.docutils.citation td.label,html.writer-html4 .rst-content table.docutils.footnote td.label{padding-left:0;padding-right:0;vertical-align:top}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{display:grid;grid-template-columns:auto minmax(80%,95%)}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{display:inline-grid;grid-template-columns:max-content auto}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{display:grid;grid-template-columns:auto 
auto minmax(.65rem,auto) minmax(40%,95%)}html.writer-html5 .rst-content aside.citation>span.label,html.writer-html5 .rst-content aside.footnote>span.label,html.writer-html5 .rst-content div.citation>span.label{grid-column-start:1;grid-column-end:2}html.writer-html5 .rst-content aside.citation>span.backrefs,html.writer-html5 .rst-content aside.footnote>span.backrefs,html.writer-html5 .rst-content div.citation>span.backrefs{grid-column-start:2;grid-column-end:3;grid-row-start:1;grid-row-end:3}html.writer-html5 .rst-content aside.citation>p,html.writer-html5 .rst-content aside.footnote>p,html.writer-html5 .rst-content div.citation>p{grid-column-start:4;grid-column-end:5}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.field-list,html.writer-html5 .rst-content dl.footnote{margin-bottom:24px}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dt{padding-left:1rem}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.field-list>dd,html.writer-html5 .rst-content dl.field-list>dt,html.writer-html5 .rst-content dl.footnote>dd,html.writer-html5 .rst-content dl.footnote>dt{margin-bottom:0}html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{font-size:.9rem}html.writer-html5 .rst-content dl.citation>dt,html.writer-html5 .rst-content dl.footnote>dt{margin:0 .5rem .5rem 0;line-height:1.2rem;word-break:break-all;font-weight:400}html.writer-html5 .rst-content dl.citation>dt>span.brackets:before,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:before{content:"["}html.writer-html5 .rst-content dl.citation>dt>span.brackets:after,html.writer-html5 .rst-content dl.footnote>dt>span.brackets:after{content:"]"}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref,html.writer-html5 .rst-content 
dl.footnote>dt>span.fn-backref{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a{word-break:keep-all}html.writer-html5 .rst-content dl.citation>dt>span.fn-backref>a:not(:first-child):before,html.writer-html5 .rst-content dl.footnote>dt>span.fn-backref>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content dl.citation>dd,html.writer-html5 .rst-content dl.footnote>dd{margin:0 0 .5rem;line-height:1.2rem}html.writer-html5 .rst-content dl.citation>dd p,html.writer-html5 .rst-content dl.footnote>dd p{font-size:.9rem}html.writer-html5 .rst-content aside.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content div.citation{padding-left:1rem;padding-right:1rem;font-size:.9rem;line-height:1.2rem}html.writer-html5 .rst-content aside.citation p,html.writer-html5 .rst-content aside.footnote p,html.writer-html5 .rst-content div.citation p{font-size:.9rem;line-height:1.2rem;margin-bottom:12px}html.writer-html5 .rst-content aside.citation span.backrefs,html.writer-html5 .rst-content aside.footnote span.backrefs,html.writer-html5 .rst-content div.citation span.backrefs{text-align:left;font-style:italic;margin-left:.65rem;word-break:break-word;word-spacing:-.1rem;max-width:5rem}html.writer-html5 .rst-content aside.citation span.backrefs>a,html.writer-html5 .rst-content aside.footnote span.backrefs>a,html.writer-html5 .rst-content div.citation span.backrefs>a{word-break:keep-all}html.writer-html5 .rst-content aside.citation span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content aside.footnote span.backrefs>a:not(:first-child):before,html.writer-html5 .rst-content div.citation span.backrefs>a:not(:first-child):before{content:" "}html.writer-html5 .rst-content aside.citation span.label,html.writer-html5 .rst-content aside.footnote 
span.label,html.writer-html5 .rst-content div.citation span.label{line-height:1.2rem}html.writer-html5 .rst-content aside.citation-list,html.writer-html5 .rst-content aside.footnote-list,html.writer-html5 .rst-content div.citation-list{margin-bottom:24px}html.writer-html5 .rst-content dl.option-list kbd{font-size:.9rem}.rst-content table.docutils.footnote,html.writer-html4 .rst-content table.docutils.citation,html.writer-html5 .rst-content aside.footnote,html.writer-html5 .rst-content aside.footnote-list aside.footnote,html.writer-html5 .rst-content div.citation-list>div.citation,html.writer-html5 .rst-content dl.citation,html.writer-html5 .rst-content dl.footnote{color:grey}.rst-content table.docutils.footnote code,.rst-content table.docutils.footnote tt,html.writer-html4 .rst-content table.docutils.citation code,html.writer-html4 .rst-content table.docutils.citation tt,html.writer-html5 .rst-content aside.footnote-list aside.footnote code,html.writer-html5 .rst-content aside.footnote-list aside.footnote tt,html.writer-html5 .rst-content aside.footnote code,html.writer-html5 .rst-content aside.footnote tt,html.writer-html5 .rst-content div.citation-list>div.citation code,html.writer-html5 .rst-content div.citation-list>div.citation tt,html.writer-html5 .rst-content dl.citation code,html.writer-html5 .rst-content dl.citation tt,html.writer-html5 .rst-content dl.footnote code,html.writer-html5 .rst-content dl.footnote tt{color:#555}.rst-content .wy-table-responsive.citation,.rst-content .wy-table-responsive.footnote{margin-bottom:0}.rst-content .wy-table-responsive.citation+:not(.citation),.rst-content .wy-table-responsive.footnote+:not(.footnote){margin-top:24px}.rst-content .wy-table-responsive.citation:last-child,.rst-content .wy-table-responsive.footnote:last-child{margin-bottom:24px}.rst-content table.docutils th{border-color:#e1e4e5}html.writer-html5 .rst-content table.docutils th{border:1px solid #e1e4e5}html.writer-html5 .rst-content table.docutils 
td>p,html.writer-html5 .rst-content table.docutils th>p{line-height:1rem;margin-bottom:0;font-size:.9rem}.rst-content table.docutils td .last,.rst-content table.docutils td .last>:last-child{margin-bottom:0}.rst-content table.field-list,.rst-content table.field-list td{border:none}.rst-content table.field-list td p{line-height:inherit}.rst-content table.field-list td>strong{display:inline-block}.rst-content table.field-list .field-name{padding-right:10px;text-align:left;white-space:nowrap}.rst-content table.field-list .field-body{text-align:left}.rst-content code,.rst-content tt{color:#000;font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;padding:2px 5px}.rst-content code big,.rst-content code em,.rst-content tt big,.rst-content tt em{font-size:100%!important;line-height:normal}.rst-content code.literal,.rst-content tt.literal{color:#e74c3c;white-space:normal}.rst-content code.xref,.rst-content tt.xref,a .rst-content code,a .rst-content tt{font-weight:700;color:#404040;overflow-wrap:normal}.rst-content kbd,.rst-content pre,.rst-content samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace}.rst-content a code,.rst-content a tt{color:#2980b9}.rst-content dl{margin-bottom:24px}.rst-content dl dt{font-weight:700;margin-bottom:12px}.rst-content dl ol,.rst-content dl p,.rst-content dl table,.rst-content dl ul{margin-bottom:12px}.rst-content dl dd{margin:0 0 12px 24px;line-height:24px}.rst-content dl dd>ol:last-child,.rst-content dl dd>p:last-child,.rst-content dl dd>table:last-child,.rst-content dl dd>ul:last-child{margin-bottom:0}html.writer-html4 .rst-content dl:not(.docutils),html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple){margin-bottom:24px}html.writer-html4 .rst-content dl:not(.docutils)>dt,html.writer-html5 .rst-content 
dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{display:table;margin:6px 0;font-size:90%;line-height:normal;background:#e7f2fa;color:#2980b9;border-top:3px solid #6ab0de;padding:6px;position:relative}html.writer-html4 .rst-content dl:not(.docutils)>dt:before,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:before{color:#6ab0de}html.writer-html4 .rst-content dl:not(.docutils)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt{margin-bottom:6px;border:none;border-left:3px solid #ccc;background:#f0f0f0;color:#555}html.writer-html4 .rst-content dl:not(.docutils) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) dl:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt .headerlink{color:#404040;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils)>dt:first-child,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple)>dt:first-child{margin-top:0}html.writer-html4 .rst-content dl:not(.docutils) code.descclassname,html.writer-html4 .rst-content dl:not(.docutils) 
code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descclassname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{background-color:transparent;border:none;padding:0;font-size:100%!important}html.writer-html4 .rst-content dl:not(.docutils) code.descname,html.writer-html4 .rst-content dl:not(.docutils) tt.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) code.descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) tt.descname{font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .optional,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .optional{display:inline-block;padding:0 4px;color:#000;font-weight:700}html.writer-html4 .rst-content dl:not(.docutils) .property,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .property{display:inline-block;padding-right:8px;max-width:100%}html.writer-html4 .rst-content dl:not(.docutils) .k,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .k{font-style:italic}html.writer-html4 
.rst-content dl:not(.docutils) .descclassname,html.writer-html4 .rst-content dl:not(.docutils) .descname,html.writer-html4 .rst-content dl:not(.docutils) .sig-name,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descclassname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .descname,html.writer-html5 .rst-content dl[class]:not(.option-list):not(.field-list):not(.footnote):not(.citation):not(.glossary):not(.simple) .sig-name{font-family:SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,Courier,monospace;color:#000}.rst-content .viewcode-back,.rst-content .viewcode-link{display:inline-block;color:#27ae60;font-size:80%;padding-left:24px}.rst-content .viewcode-back{display:block;float:right}.rst-content p.rubric{margin-bottom:12px;font-weight:700}.rst-content code.download,.rst-content tt.download{background:inherit;padding:inherit;font-weight:400;font-family:inherit;font-size:inherit;color:inherit;border:inherit;white-space:inherit}.rst-content code.download span:first-child,.rst-content tt.download span:first-child{-webkit-font-smoothing:subpixel-antialiased}.rst-content code.download span:first-child:before,.rst-content tt.download span:first-child:before{margin-right:4px}.rst-content .guilabel{border:1px solid #7fbbe3;background:#e7f2fa;font-size:80%;font-weight:700;border-radius:4px;padding:2.4px 6px;margin:auto 2px}.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>.kbd,.rst-content :not(dl.option-list)>:not(dt):not(kbd):not(.kbd)>kbd{color:inherit;font-size:80%;background-color:#fff;border:1px solid #a6a6a6;border-radius:4px;box-shadow:0 2px grey;padding:2.4px 6px;margin:auto 0}.rst-content .versionmodified{font-style:italic}@media screen and (max-width:480px){.rst-content 
.sidebar{width:100%}}span[id*=MathJax-Span]{color:#404040}.math{text-align:center}@font-face{font-family:Lato;src:url(fonts/lato-normal.woff2?bd03a2cc277bbbc338d464e679fe9942) format("woff2"),url(fonts/lato-normal.woff?27bd77b9162d388cb8d4c4217c7c5e2a) format("woff");font-weight:400;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold.woff2?cccb897485813c7c256901dbca54ecf2) format("woff2"),url(fonts/lato-bold.woff?d878b6c29b10beca227e9eef4246111b) format("woff");font-weight:700;font-style:normal;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-bold-italic.woff2?0b6bb6725576b072c5d0b02ecdd1900d) format("woff2"),url(fonts/lato-bold-italic.woff?9c7e4e9eb485b4a121c760e61bc3707c) format("woff");font-weight:700;font-style:italic;font-display:block}@font-face{font-family:Lato;src:url(fonts/lato-normal-italic.woff2?4eb103b4d12be57cb1d040ed5e162e9d) format("woff2"),url(fonts/lato-normal-italic.woff?f28f2d6482446544ef1ea1ccc6dd5892) format("woff");font-weight:400;font-style:italic;font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:400;src:url(fonts/Roboto-Slab-Regular.woff2?7abf5b8d04d26a2cafea937019bca958) format("woff2"),url(fonts/Roboto-Slab-Regular.woff?c1be9284088d487c5e3ff0a10a92e58c) format("woff");font-display:block}@font-face{font-family:Roboto Slab;font-style:normal;font-weight:700;src:url(fonts/Roboto-Slab-Bold.woff2?9984f4a9bda09be08e83f2506954adbe) format("woff2"),url(fonts/Roboto-Slab-Bold.woff?bed5564a116b05148e3b3bea6fb1162a) format("woff");font-display:block}
diff --git a/css/theme_extra.css b/css/theme_extra.css
new file mode 100644
index 00000000..9f4b063c
--- /dev/null
+++ b/css/theme_extra.css
@@ -0,0 +1,191 @@
+/*
+ * Wrap inline code samples; otherwise they shoot off the side of the
+ * page and can't be read at all.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/313
+ * https://github.com/mkdocs/mkdocs/issues/233
+ * https://github.com/mkdocs/mkdocs/issues/834
+ */
+.rst-content code {
+    white-space: pre-wrap;
+    word-wrap: break-word; /* legacy name of overflow-wrap */
+    padding: 2px 5px;
+}
+
+/**
+ * Make code blocks display as blocks with their own font size and
+ * padding, and switch off the wrapping that inline code gets.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/855
+ * https://github.com/mkdocs/mkdocs/issues/834
+ * https://github.com/mkdocs/mkdocs/issues/233
+ */
+.rst-content pre code {
+    white-space: pre;
+    word-wrap: normal;
+    display: block;
+    padding: 12px;
+    font-size: 12px;
+}
+
+/**
+ * Fix code colors: red for inline code, black on light grey inside <pre>.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/2027
+ */
+.rst-content code {
+    color: #E74C3C;
+}
+
+.rst-content pre code {
+    color: #000;
+    background: #f8f8f8;
+}
+
+/*
+ * Fix link colors (default, hover, visited) when the link text is inline code.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/718
+ */
+a code {
+    color: #2980B9;
+}
+a:hover code {
+    color: #3091d1;
+}
+a:visited code {
+    color: #9B59B6; /* same specificity as :hover but later, so it wins on visited links */
+}
+
+/*
+ * The CSS classes from highlight.js seem to clash with the ReadTheDocs
+ * theme, causing some code to be incorrectly made bold and italic, so
+ * .cs and .c inside <pre> inherit their font weight and style instead.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/411
+ */
+pre .cs, pre .c {
+    font-weight: inherit;
+    font-style: inherit;
+}
+
+/*
+ * Fix some issues with the theme and non-highlighted code
+ * samples. Without any highlighting styles attached, the
+ * formatting is broken.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/319
+ */
+.rst-content .no-highlight {
+    display: block;
+    padding: 0.5em;
+    color: #333;
+}
+
+
+/*
+ * Additions for the MkDocs search: results list and search input styling.
+ */
+
+.search-results {
+    margin-top: 23px;
+}
+
+.search-results article {
+    border-top: 1px solid #E1E4E5; /* rule between consecutive results */
+    padding-top: 24px;
+}
+
+.search-results article:first-child {
+    border-top: none; /* no rule above the first result */
+}
+
+form .search-query {
+    width: 100%;
+    border-radius: 50px;
+    padding: 6px 12px;  /* csslint allow: box-model */
+    border-color: #D1D4D5;
+}
+
+/*
+ * Improve inline code blocks within admonitions.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/656
+ */
+ .rst-content .admonition code {
+    color: #404040;
+    border: 1px solid #c7c9cb; /* fallback where rgba() is unsupported */
+    border: 1px solid rgba(0, 0, 0, 0.2);
+    background: #f8fbfd; /* fallback where rgba() is unsupported */
+    background: rgba(255, 255, 255, 0.7);
+}
+
+/*
+ * Account for wide tables which go off the side by making them scrollable
+ * blocks. Override borders (on every td/th) to avoid weirdness on narrow tables.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/834
+ * https://github.com/mkdocs/mkdocs/pull/1034
+ */
+.rst-content .section .docutils {
+    width: 100%;
+    overflow: auto;
+    display: block;
+    border: none;
+}
+
+td, th {
+    border: 1px solid #e1e4e5 !important; /* csslint allow: important */
+    border-collapse: collapse; /* NOTE(review): defined for table elements; likely a no-op on cells - confirm */
+}
+
+/*
+ * Without the following amendments, the navigation in the theme will be
+ * slightly cut off. This is because .wy-nav-side has a padding-bottom of 2em,
+ * which does not necessarily match the height that a font-size of 90 % and a
+ * padding of 12px above and below give the .rst-current-version container.
+ * These amendments fix this in two steps: First, make sure the
+ * .rst-current-version container has a fixed height of 40px, achieved using
+ * line-height, and apply a matching padding-bottom of 40px to .wy-nav-side. In
+ * a second step, the items within that container are re-aligned using flexbox.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/2012
+ */
+ .wy-nav-side {
+    padding-bottom: 40px;
+}
+
+/*
+ * The second step of the above amendment: here we make sure the items are
+ * aligned correctly within the .rst-current-version container. Using flexbox,
+ * we achieve it in such a way that it will look like the following:
+ *
+ * [No repo_name]
+ *         Next >>                    // On the first page
+ * << Previous     Next >>            // On all subsequent pages
+ *
+ * [With repo_name]
+ *    <repo_name>        Next >>      // On the first page
+ * <repo_name>  << Previous  Next >>  // On all subsequent pages
+ *
+ * https://github.com/mkdocs/mkdocs/issues/2012
+ */
+.rst-versions .rst-current-version {
+    padding: 0 12px; /* theme.css has 12px on all four sides */
+    display: flex;
+    font-size: initial; /* overrides the 90% that theme.css sets here */
+    justify-content: space-between;
+    align-items: center;
+    line-height: 40px; /* with no vertical padding this makes the bar 40px tall */
+}
+
+/*
+ * Please note that this amendment also involves removing certain inline styles
+ * from the file ./mkdocs/themes/readthedocs/versions.html.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/2012
+ */
+.rst-current-version span {
+    flex: 1; /* grow from a zero basis, so spans share the row equally */
+    text-align: center;
+}
diff --git a/diagram/gen.py b/diagram/gen.py
new file mode 100644
index 00000000..37f0302d
--- /dev/null
+++ b/diagram/gen.py
@@ -0,0 +1,294 @@
+elen = -1  # width of one element, same unit as vlen; -1 means "not set yet"
+vlen = -1  # total vector width; each diagram function assigns it before drawing
+row_index = 0  # 1-based number of the row being drawn (0 before the first add_row)
+col_index = 0  # 0-based column the next add_box call will use
+
+row_space = 100  # vertical distance between consecutive rows
+index_x = 20  # x position of the "(n)" row label
+arg_x = 100  # x position of the first column's right-aligned labels
+desc_y_off = 15  # vertical offset of the description text below the argument name
+element_x = 10  # horizontal gap between a column's label anchor and its first rect
+element_x_unit = 2  # horizontal size drawn per unit of element width
+col_space = -1  # horizontal distance between columns; chosen in init() from vlen
+box_height = 26  # height of every element rect
+
+
+def init(f, rows, cols):
+    global col_space
+    if vlen == 256:
+        col_space = 570
+    else:
+        col_space = 300
+    print(
+        f"""<svg version="1.1"
+     width="{cols * col_space + arg_x}" height="{rows * row_space + 50}"
+     xmlns="http://www.w3.org/2000/svg">""",
+        file=f,
+    )
+
+
+def end(f):
+    global row_index, elen, vlen
+    row_index = 0
+    elen = -1
+    vlen = -1
+    print(
+        "</svg>",
+        file=f,
+    )
+
+
+def add_row(f):
+    global row_index, col_index
+    # row index
+    row_index = row_index + 1
+    col_index = 0
+    print(
+        f'<text x="{index_x}" y="{row_space * row_index}">({row_index})</text>', file=f
+    )
+
+
+def add_box(f, arg, desc, indices=None):
+    global col_index
+    x_base = arg_x + col_index * col_space
+    col_index += 1
+    # arg name
+    print(
+        f'<text x="{x_base}" y="{row_space * row_index}" text-anchor="end" font-style="italic">{arg}</text>',
+        file=f,
+    )
+    # desc name
+    print(
+        f'<text x="{x_base}" y="{row_space * row_index + desc_y_off}" text-anchor="end">{desc}</text>',
+        file=f,
+    )
+    # add element rects
+    for i in range(vlen // elen):
+        per_element_x = element_x_unit * elen
+        print(
+            f'<rect x="{x_base + element_x + per_element_x * i}" y="{row_space * row_index - 10}" width="{per_element_x}" height="{box_height}" fill="white" stroke="blue" />',
+            file=f,
+        )
+        if indices is not None:
+            print(
+                f'<text x="{x_base + element_x + per_element_x * i + per_element_x // 2}" y="{row_space * row_index - 10 + box_height // 2}" text-anchor="middle" dominant-baseline="middle">{indices[i]}</text>',
+                file=f,
+            )
+
+    # add upper/lower indicator
+    if row_index == 1:
+        print(
+            f'<text x="{x_base + element_x}" y="{row_space * row_index - 20}" text-anchor="begin">upper</text>',
+            file=f,
+        )
+        print(
+            f'<text x="{x_base + element_x + element_x_unit * vlen}" y="{row_space * row_index - 20}" text-anchor="end">lower</text>',
+            file=f,
+        )
+
+
+def add_line(f, from_row, from_col, from_index, to_row, to_col, to_index):
+    x_base = arg_x + from_col * col_space
+    per_element_x = element_x_unit * elen
+    x1 = x_base + element_x + per_element_x * from_index + per_element_x // 2
+    y1 = row_space * from_row - 10 + box_height
+
+    x_base = arg_x + to_col * col_space
+    per_element_x = element_x_unit * elen
+    x2 = x_base + element_x + per_element_x * to_index + per_element_x // 2
+    y2 = row_space * to_row - 10
+
+    print(f'<line x1="{x1}" y1="{y1}" x2="{x2}" y2="{y2}" stroke="black" />', file=f)
+
+
+def xvshuf():
+    global elen, vlen
+    for el, name in [(64, "d"), (32, "w"), (16, "h"), (8, "b")]:
+        elen = el
+        vlen = 256
+        with open(f"xvshuf_{name}.svg", "w") as f:
+            init(f, 4, 2)
+            add_row(f)
+            add_box(
+                f,
+                "b" if elen > 8 else "a",
+                "data",
+                indices=list(range(vlen // elen - 1, vlen // elen // 2 - 1, -1)) * 2,
+            )
+            add_box(
+                f,
+                "c" if elen > 8 else "b",
+                "data",
+                indices=list(range(vlen // elen // 2 - 1, -1, -1)) * 2,
+            )
+            add_row(f)
+            add_box(f, "hi", "merged", indices=list(range(vlen // elen - 1, -1, -1)))
+            add_box(f, "lo", "merged", indices=list(range(vlen // elen - 1, -1, -1)))
+            add_row(f)
+            add_box(f, "a" if elen > 8 else "c", "indices")
+            add_row(f)
+            add_box(f, "ret", "returns")
+
+            # b to merge & c to merge
+            for i in range(vlen // elen // 2):
+                add_line(f, 1, 0, i, 2, 0, i)
+                add_line(f, 1, 0, i + vlen // elen // 2, 2, 1, i)
+            for i in range(vlen // elen // 2):
+                add_line(f, 1, 1, i, 2, 0, i + vlen // elen // 2)
+                add_line(f, 1, 1, i + vlen // elen // 2, 2, 1, i + vlen // elen // 2)
+
+            # merge to a
+            for i in range(vlen // elen):
+                for j in range(vlen // elen // 2):
+                    add_line(f, 2, 0, i, 3, 0, j)
+                    add_line(f, 2, 1, i, 3, 0, j + vlen // elen // 2)
+
+            # a to ret
+            for i in range(vlen // elen):
+                add_line(f, 3, 0, i, 4, 0, i)
+            end(f)
+
+
+def xvshuf4i_bhw():
+    global elen, vlen
+    for vl, prefix in [(256, "xv"), (128, "v")]:
+        for el, name in [(32, "w"), (16, "h"), (8, "b")]:
+            elen = el
+            vlen = vl
+            with open(f"{prefix}shuf4i_{name}.svg", "w") as f:
+                init(f, 2, 1)
+                add_row(f)
+                add_box(
+                    f,
+                    "a",
+                    "data",
+                    indices=list(range(3, -1, -1)) * (vlen // el // 4),
+                )
+                add_row(f)
+                add_box(f, "ret", "returns")
+
+                # a to returns
+                for i in range(vlen // elen):
+                    for j in range(4):
+                        add_line(f, 1, 0, i, 2, 0, i // 4 * 4 + j)
+                end(f)
+
+
+def xvshuf4i_d():
+    global elen, vlen
+    elen = 64
+    vlen = 256
+    with open("xvshuf4i_d.svg", "w") as f:
+        init(f, 2, 2)
+        add_row(f)
+        add_box(
+            f,
+            "b",
+            "data",
+            indices=[3, 2, 3, 2],
+        )
+        add_box(
+            f,
+            "a",
+            "data",
+            indices=[1, 0, 1, 0],
+        )
+        add_row(f)
+        add_box(f, "ret", "returns")
+
+        # a & b to returns
+        add_line(f, 1, 0, 2, 2, 0, 3)
+        add_line(f, 1, 0, 3, 2, 0, 3)
+        add_line(f, 1, 1, 2, 2, 0, 3)
+        add_line(f, 1, 1, 3, 2, 0, 3)
+        add_line(f, 1, 0, 2, 2, 0, 2)
+        add_line(f, 1, 0, 3, 2, 0, 2)
+        add_line(f, 1, 1, 2, 2, 0, 2)
+        add_line(f, 1, 1, 3, 2, 0, 2)
+        add_line(f, 1, 0, 0, 2, 0, 1)
+        add_line(f, 1, 0, 1, 2, 0, 1)
+        add_line(f, 1, 1, 0, 2, 0, 1)
+        add_line(f, 1, 1, 1, 2, 0, 1)
+        add_line(f, 1, 0, 0, 2, 0, 0)
+        add_line(f, 1, 0, 1, 2, 0, 0)
+        add_line(f, 1, 1, 0, 2, 0, 0)
+        add_line(f, 1, 1, 1, 2, 0, 0)
+        end(f)
+
+
+def vshuf():
+    global elen, vlen
+    for el, name in [(64, "d"), (32, "w"), (16, "h"), (8, "b")]:
+        elen = el
+        vlen = 128
+        with open(f"vshuf_{name}.svg", "w") as f:
+            init(f, 3, 2)
+            add_row(f)
+            add_box(
+                f,
+                "b" if elen > 8 else "a",
+                "data",
+                indices=list(range(vlen // elen * 2 - 1, vlen // elen - 1, -1)),
+            )
+            add_box(
+                f,
+                "c" if elen > 8 else "b",
+                "data",
+                indices=list(range(vlen // elen - 1, -1, -1)),
+            )
+            add_row(f)
+            add_box(f, "a" if elen > 8 else "c", "indices")
+            add_row(f)
+            add_box(f, "ret", "returns")
+
+            # b to a & c to a
+            for i in range(vlen // elen):
+                for j in range(vlen // elen):
+                    add_line(f, 1, 0, j, 2, 0, i)
+                    add_line(f, 1, 1, j, 2, 0, i)
+
+            # a to ret
+            for i in range(vlen // elen):
+                add_line(f, 2, 0, i, 3, 0, i)
+            end(f)
+
+def vshuf4i_d():
+    global elen, vlen
+    elen = 64
+    vlen = 128
+    with open("vshuf4i_d.svg", "w") as f:
+        init(f, 2, 2)
+        add_row(f)
+        add_box(
+            f,
+            "b",
+            "data",
+            indices=[3, 2],
+        )
+        add_box(
+            f,
+            "a",
+            "data",
+            indices=[1, 0],
+        )
+        add_row(f)
+        add_box(f, "ret", "returns")
+
+        # a & b to returns
+        add_line(f, 1, 0, 0, 2, 0, 1)
+        add_line(f, 1, 0, 1, 2, 0, 1)
+        add_line(f, 1, 1, 0, 2, 0, 1)
+        add_line(f, 1, 1, 1, 2, 0, 1)
+        add_line(f, 1, 0, 0, 2, 0, 0)
+        add_line(f, 1, 0, 1, 2, 0, 0)
+        add_line(f, 1, 1, 0, 2, 0, 0)
+        add_line(f, 1, 1, 1, 2, 0, 0)
+        end(f)
+
+
+if __name__ == "__main__":
+    xvshuf()
+    xvshuf4i_bhw()
+    xvshuf4i_d()
+    vshuf()
+    vshuf4i_d()
diff --git a/diagram/vshuf4i_b.svg b/diagram/vshuf4i_b.svg
new file mode 100644
index 00000000..c2bc66b4
--- /dev/null
+++ b/diagram/vshuf4i_b.svg
@@ -0,0 +1,124 @@
+<svg version="1.1"
+     width="400" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="118" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="126" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="134" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="142" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="150" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="158" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="166" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="174" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="182" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="190" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="198" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="206" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="214" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="222" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="230" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="238" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="246" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="254" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="262" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="270" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="278" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="286" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="294" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="302" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="310" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="318" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="326" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="334" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="342" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="350" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="358" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="126" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="142" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="158" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="190" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="206" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="222" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="254" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="270" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="286" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="318" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="334" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="350" y="190" width="16" height="26" fill="white" stroke="blue" />
+<line x1="118" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="358" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/vshuf4i_d.svg b/diagram/vshuf4i_d.svg
new file mode 100644
index 00000000..10a24caf
--- /dev/null
+++ b/diagram/vshuf4i_d.svg
@@ -0,0 +1,34 @@
+<svg version="1.1"
+     width="700" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="174" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="238" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="302" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="400" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="400" y="115" text-anchor="end">data</text>
+<rect x="410" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="474" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="538" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="602" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="410" y="80" text-anchor="start">upper</text>
+<text x="666" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="128" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="128" height="26" fill="white" stroke="blue" />
+<line x1="174" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="474" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="602" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="174" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="474" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="602" y1="116" x2="174" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/vshuf4i_h.svg b/diagram/vshuf4i_h.svg
new file mode 100644
index 00000000..3893dc4d
--- /dev/null
+++ b/diagram/vshuf4i_h.svg
@@ -0,0 +1,68 @@
+<svg version="1.1"
+     width="400" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="126" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="142" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="158" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="174" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="190" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="206" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="222" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="238" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="254" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="270" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="286" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="302" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="318" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="334" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="350" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="142" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="206" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="270" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="334" y="190" width="32" height="26" fill="white" stroke="blue" />
+<line x1="126" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="350" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/vshuf4i_w.svg b/diagram/vshuf4i_w.svg
new file mode 100644
index 00000000..1ae56c19
--- /dev/null
+++ b/diagram/vshuf4i_w.svg
@@ -0,0 +1,40 @@
+<svg version="1.1"
+     width="400" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="142" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="174" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="206" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="238" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="270" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="302" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="334" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="64" height="26" fill="white" stroke="blue" />
+<line x1="142" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="334" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/vshuf_b.svg b/diagram/vshuf_b.svg
new file mode 100644
index 00000000..8d80113f
--- /dev/null
+++ b/diagram/vshuf_b.svg
@@ -0,0 +1,643 @@
+<svg version="1.1"
+     width="700" height="350"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="118" y="103" text-anchor="middle" dominant-baseline="middle">31</text>
+<rect x="126" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="134" y="103" text-anchor="middle" dominant-baseline="middle">30</text>
+<rect x="142" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="150" y="103" text-anchor="middle" dominant-baseline="middle">29</text>
+<rect x="158" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="166" y="103" text-anchor="middle" dominant-baseline="middle">28</text>
+<rect x="174" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="182" y="103" text-anchor="middle" dominant-baseline="middle">27</text>
+<rect x="190" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="198" y="103" text-anchor="middle" dominant-baseline="middle">26</text>
+<rect x="206" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="214" y="103" text-anchor="middle" dominant-baseline="middle">25</text>
+<rect x="222" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="230" y="103" text-anchor="middle" dominant-baseline="middle">24</text>
+<rect x="238" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="246" y="103" text-anchor="middle" dominant-baseline="middle">23</text>
+<rect x="254" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="262" y="103" text-anchor="middle" dominant-baseline="middle">22</text>
+<rect x="270" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="278" y="103" text-anchor="middle" dominant-baseline="middle">21</text>
+<rect x="286" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="294" y="103" text-anchor="middle" dominant-baseline="middle">20</text>
+<rect x="302" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="310" y="103" text-anchor="middle" dominant-baseline="middle">19</text>
+<rect x="318" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="326" y="103" text-anchor="middle" dominant-baseline="middle">18</text>
+<rect x="334" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="342" y="103" text-anchor="middle" dominant-baseline="middle">17</text>
+<rect x="350" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="358" y="103" text-anchor="middle" dominant-baseline="middle">16</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="400" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="400" y="115" text-anchor="end">data</text>
+<rect x="410" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="418" y="103" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="426" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="434" y="103" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="442" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="450" y="103" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="458" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="466" y="103" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="474" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="482" y="103" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="490" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="498" y="103" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="506" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="514" y="103" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="522" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="530" y="103" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="538" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="546" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="554" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="562" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="570" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="578" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="586" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="594" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="602" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="610" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="618" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="626" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="634" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="642" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="650" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="658" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="410" y="80" text-anchor="start">upper</text>
+<text x="666" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">c</text>
+<text x="100" y="215" text-anchor="end">indices</text>
+<rect x="110" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="126" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="142" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="158" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="190" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="206" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="222" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="254" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="270" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="286" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="318" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="334" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="350" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="315" text-anchor="end">returns</text>
+<rect x="110" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="126" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="142" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="158" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="174" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="190" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="206" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="222" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="254" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="270" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="286" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="302" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="318" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="334" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="350" y="290" width="16" height="26" fill="white" stroke="blue" />
+<line x1="118" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="418" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="434" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="450" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="466" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="482" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="498" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="514" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="530" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="546" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="562" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="578" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="594" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="610" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="626" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="642" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="658" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="118" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="358" y2="290" stroke="black" />
+</svg>
diff --git a/diagram/vshuf_d.svg b/diagram/vshuf_d.svg
new file mode 100644
index 00000000..677c9548
--- /dev/null
+++ b/diagram/vshuf_d.svg
@@ -0,0 +1,41 @@
+<svg version="1.1"
+     width="700" height="350"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="174" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="238" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="302" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="400" y="100" text-anchor="end" font-style="italic">c</text>
+<text x="400" y="115" text-anchor="end">data</text>
+<rect x="410" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="474" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="538" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="602" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="410" y="80" text-anchor="start">upper</text>
+<text x="666" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="215" text-anchor="end">indices</text>
+<rect x="110" y="190" width="128" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="315" text-anchor="end">returns</text>
+<rect x="110" y="290" width="128" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="128" height="26" fill="white" stroke="blue" />
+<line x1="174" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="474" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="602" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="174" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="474" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="602" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="174" y1="216" x2="174" y2="290" stroke="black" />
+<line x1="302" y1="216" x2="302" y2="290" stroke="black" />
+</svg>
diff --git a/diagram/vshuf_h.svg b/diagram/vshuf_h.svg
new file mode 100644
index 00000000..5c528285
--- /dev/null
+++ b/diagram/vshuf_h.svg
@@ -0,0 +1,203 @@
+<svg version="1.1"
+     width="700" height="350"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="126" y="103" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="142" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="158" y="103" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="174" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="190" y="103" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="206" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="222" y="103" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="238" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="254" y="103" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="270" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="286" y="103" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="302" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="318" y="103" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="334" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="350" y="103" text-anchor="middle" dominant-baseline="middle">8</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="400" y="100" text-anchor="end" font-style="italic">c</text>
+<text x="400" y="115" text-anchor="end">data</text>
+<rect x="410" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="426" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="442" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="458" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="474" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="490" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="506" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="522" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="538" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="554" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="570" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="586" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="602" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="618" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="634" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="650" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="410" y="80" text-anchor="start">upper</text>
+<text x="666" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="215" text-anchor="end">indices</text>
+<rect x="110" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="142" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="206" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="270" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="334" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="315" text-anchor="end">returns</text>
+<rect x="110" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="142" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="174" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="206" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="270" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="302" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="334" y="290" width="32" height="26" fill="white" stroke="blue" />
+<line x1="126" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="426" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="458" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="490" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="522" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="554" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="586" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="618" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="650" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="126" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="350" y2="290" stroke="black" />
+</svg>
diff --git a/diagram/vshuf_w.svg b/diagram/vshuf_w.svg
new file mode 100644
index 00000000..5ddf2000
--- /dev/null
+++ b/diagram/vshuf_w.svg
@@ -0,0 +1,79 @@
+<svg version="1.1"
+     width="700" height="350"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="142" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="174" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="206" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="238" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="270" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="302" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="334" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="366" y="80" text-anchor="end">lower</text>
+<text x="400" y="100" text-anchor="end" font-style="italic">c</text>
+<text x="400" y="115" text-anchor="end">data</text>
+<rect x="410" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="442" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="474" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="506" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="538" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="570" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="602" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="634" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="410" y="80" text-anchor="start">upper</text>
+<text x="666" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="215" text-anchor="end">indices</text>
+<rect x="110" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="315" text-anchor="end">returns</text>
+<rect x="110" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="174" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="302" y="290" width="64" height="26" fill="white" stroke="blue" />
+<line x1="142" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="442" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="506" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="570" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="634" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="442" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="506" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="570" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="634" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="442" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="506" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="570" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="634" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="442" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="506" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="570" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="634" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="142" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="206" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="270" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="334" y1="216" x2="334" y2="290" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf4i_b.svg b/diagram/xvshuf4i_b.svg
new file mode 100644
index 00000000..e21a7c31
--- /dev/null
+++ b/diagram/xvshuf4i_b.svg
@@ -0,0 +1,236 @@
+<svg version="1.1"
+     width="670" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="118" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="126" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="134" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="142" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="150" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="158" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="166" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="174" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="182" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="190" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="198" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="206" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="214" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="222" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="230" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="238" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="246" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="254" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="262" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="270" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="278" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="286" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="294" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="302" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="310" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="318" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="326" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="334" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="342" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="350" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="358" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="366" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="374" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="382" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="390" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="398" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="406" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="414" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="422" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="430" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="438" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="446" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="454" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="462" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="470" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="478" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="486" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="494" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="502" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="510" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="518" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="526" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="534" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="542" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="550" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="558" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="566" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="574" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="582" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="590" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="598" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="606" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="614" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="126" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="142" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="158" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="190" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="206" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="222" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="254" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="270" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="286" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="318" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="334" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="350" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="366" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="382" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="398" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="414" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="430" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="446" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="462" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="478" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="494" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="510" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="526" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="542" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="558" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="574" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="590" y="190" width="16" height="26" fill="white" stroke="blue" />
+<rect x="606" y="190" width="16" height="26" fill="white" stroke="blue" />
+<line x1="118" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="118" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="374" y1="116" x2="374" y2="190" stroke="black" />
+<line x1="374" y1="116" x2="390" y2="190" stroke="black" />
+<line x1="374" y1="116" x2="406" y2="190" stroke="black" />
+<line x1="374" y1="116" x2="422" y2="190" stroke="black" />
+<line x1="390" y1="116" x2="374" y2="190" stroke="black" />
+<line x1="390" y1="116" x2="390" y2="190" stroke="black" />
+<line x1="390" y1="116" x2="406" y2="190" stroke="black" />
+<line x1="390" y1="116" x2="422" y2="190" stroke="black" />
+<line x1="406" y1="116" x2="374" y2="190" stroke="black" />
+<line x1="406" y1="116" x2="390" y2="190" stroke="black" />
+<line x1="406" y1="116" x2="406" y2="190" stroke="black" />
+<line x1="406" y1="116" x2="422" y2="190" stroke="black" />
+<line x1="422" y1="116" x2="374" y2="190" stroke="black" />
+<line x1="422" y1="116" x2="390" y2="190" stroke="black" />
+<line x1="422" y1="116" x2="406" y2="190" stroke="black" />
+<line x1="422" y1="116" x2="422" y2="190" stroke="black" />
+<line x1="438" y1="116" x2="438" y2="190" stroke="black" />
+<line x1="438" y1="116" x2="454" y2="190" stroke="black" />
+<line x1="438" y1="116" x2="470" y2="190" stroke="black" />
+<line x1="438" y1="116" x2="486" y2="190" stroke="black" />
+<line x1="454" y1="116" x2="438" y2="190" stroke="black" />
+<line x1="454" y1="116" x2="454" y2="190" stroke="black" />
+<line x1="454" y1="116" x2="470" y2="190" stroke="black" />
+<line x1="454" y1="116" x2="486" y2="190" stroke="black" />
+<line x1="470" y1="116" x2="438" y2="190" stroke="black" />
+<line x1="470" y1="116" x2="454" y2="190" stroke="black" />
+<line x1="470" y1="116" x2="470" y2="190" stroke="black" />
+<line x1="470" y1="116" x2="486" y2="190" stroke="black" />
+<line x1="486" y1="116" x2="438" y2="190" stroke="black" />
+<line x1="486" y1="116" x2="454" y2="190" stroke="black" />
+<line x1="486" y1="116" x2="470" y2="190" stroke="black" />
+<line x1="486" y1="116" x2="486" y2="190" stroke="black" />
+<line x1="502" y1="116" x2="502" y2="190" stroke="black" />
+<line x1="502" y1="116" x2="518" y2="190" stroke="black" />
+<line x1="502" y1="116" x2="534" y2="190" stroke="black" />
+<line x1="502" y1="116" x2="550" y2="190" stroke="black" />
+<line x1="518" y1="116" x2="502" y2="190" stroke="black" />
+<line x1="518" y1="116" x2="518" y2="190" stroke="black" />
+<line x1="518" y1="116" x2="534" y2="190" stroke="black" />
+<line x1="518" y1="116" x2="550" y2="190" stroke="black" />
+<line x1="534" y1="116" x2="502" y2="190" stroke="black" />
+<line x1="534" y1="116" x2="518" y2="190" stroke="black" />
+<line x1="534" y1="116" x2="534" y2="190" stroke="black" />
+<line x1="534" y1="116" x2="550" y2="190" stroke="black" />
+<line x1="550" y1="116" x2="502" y2="190" stroke="black" />
+<line x1="550" y1="116" x2="518" y2="190" stroke="black" />
+<line x1="550" y1="116" x2="534" y2="190" stroke="black" />
+<line x1="550" y1="116" x2="550" y2="190" stroke="black" />
+<line x1="566" y1="116" x2="566" y2="190" stroke="black" />
+<line x1="566" y1="116" x2="582" y2="190" stroke="black" />
+<line x1="566" y1="116" x2="598" y2="190" stroke="black" />
+<line x1="566" y1="116" x2="614" y2="190" stroke="black" />
+<line x1="582" y1="116" x2="566" y2="190" stroke="black" />
+<line x1="582" y1="116" x2="582" y2="190" stroke="black" />
+<line x1="582" y1="116" x2="598" y2="190" stroke="black" />
+<line x1="582" y1="116" x2="614" y2="190" stroke="black" />
+<line x1="598" y1="116" x2="566" y2="190" stroke="black" />
+<line x1="598" y1="116" x2="582" y2="190" stroke="black" />
+<line x1="598" y1="116" x2="598" y2="190" stroke="black" />
+<line x1="598" y1="116" x2="614" y2="190" stroke="black" />
+<line x1="614" y1="116" x2="566" y2="190" stroke="black" />
+<line x1="614" y1="116" x2="582" y2="190" stroke="black" />
+<line x1="614" y1="116" x2="598" y2="190" stroke="black" />
+<line x1="614" y1="116" x2="614" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf4i_d.svg b/diagram/xvshuf4i_d.svg
new file mode 100644
index 00000000..af8ed307
--- /dev/null
+++ b/diagram/xvshuf4i_d.svg
@@ -0,0 +1,52 @@
+<svg version="1.1"
+     width="1240" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="174" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="238" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="302" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="366" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="430" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="494" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="558" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="670" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="670" y="115" text-anchor="end">data</text>
+<rect x="680" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="744" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="808" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="872" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="936" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="1000" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1064" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="1128" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="680" y="80" text-anchor="start">upper</text>
+<text x="1192" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="128" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="128" height="26" fill="white" stroke="blue" />
+<rect x="366" y="190" width="128" height="26" fill="white" stroke="blue" />
+<rect x="494" y="190" width="128" height="26" fill="white" stroke="blue" />
+<line x1="430" y1="116" x2="558" y2="190" stroke="black" />
+<line x1="558" y1="116" x2="558" y2="190" stroke="black" />
+<line x1="1000" y1="116" x2="558" y2="190" stroke="black" />
+<line x1="1128" y1="116" x2="558" y2="190" stroke="black" />
+<line x1="430" y1="116" x2="430" y2="190" stroke="black" />
+<line x1="558" y1="116" x2="430" y2="190" stroke="black" />
+<line x1="1000" y1="116" x2="430" y2="190" stroke="black" />
+<line x1="1128" y1="116" x2="430" y2="190" stroke="black" />
+<line x1="174" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="744" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="872" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="174" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="744" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="872" y1="116" x2="174" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf4i_h.svg b/diagram/xvshuf4i_h.svg
new file mode 100644
index 00000000..2b4b1164
--- /dev/null
+++ b/diagram/xvshuf4i_h.svg
@@ -0,0 +1,124 @@
+<svg version="1.1"
+     width="670" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="126" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="142" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="158" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="174" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="190" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="206" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="222" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="238" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="254" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="270" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="286" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="302" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="318" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="334" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="350" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="366" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="382" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="398" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="414" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="430" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="446" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="462" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="478" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="494" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="510" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="526" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="542" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="558" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="574" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="590" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="606" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="142" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="206" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="270" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="334" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="366" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="398" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="430" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="462" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="494" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="526" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="558" y="190" width="32" height="26" fill="white" stroke="blue" />
+<rect x="590" y="190" width="32" height="26" fill="white" stroke="blue" />
+<line x1="126" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="126" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="382" y1="116" x2="382" y2="190" stroke="black" />
+<line x1="382" y1="116" x2="414" y2="190" stroke="black" />
+<line x1="382" y1="116" x2="446" y2="190" stroke="black" />
+<line x1="382" y1="116" x2="478" y2="190" stroke="black" />
+<line x1="414" y1="116" x2="382" y2="190" stroke="black" />
+<line x1="414" y1="116" x2="414" y2="190" stroke="black" />
+<line x1="414" y1="116" x2="446" y2="190" stroke="black" />
+<line x1="414" y1="116" x2="478" y2="190" stroke="black" />
+<line x1="446" y1="116" x2="382" y2="190" stroke="black" />
+<line x1="446" y1="116" x2="414" y2="190" stroke="black" />
+<line x1="446" y1="116" x2="446" y2="190" stroke="black" />
+<line x1="446" y1="116" x2="478" y2="190" stroke="black" />
+<line x1="478" y1="116" x2="382" y2="190" stroke="black" />
+<line x1="478" y1="116" x2="414" y2="190" stroke="black" />
+<line x1="478" y1="116" x2="446" y2="190" stroke="black" />
+<line x1="478" y1="116" x2="478" y2="190" stroke="black" />
+<line x1="510" y1="116" x2="510" y2="190" stroke="black" />
+<line x1="510" y1="116" x2="542" y2="190" stroke="black" />
+<line x1="510" y1="116" x2="574" y2="190" stroke="black" />
+<line x1="510" y1="116" x2="606" y2="190" stroke="black" />
+<line x1="542" y1="116" x2="510" y2="190" stroke="black" />
+<line x1="542" y1="116" x2="542" y2="190" stroke="black" />
+<line x1="542" y1="116" x2="574" y2="190" stroke="black" />
+<line x1="542" y1="116" x2="606" y2="190" stroke="black" />
+<line x1="574" y1="116" x2="510" y2="190" stroke="black" />
+<line x1="574" y1="116" x2="542" y2="190" stroke="black" />
+<line x1="574" y1="116" x2="574" y2="190" stroke="black" />
+<line x1="574" y1="116" x2="606" y2="190" stroke="black" />
+<line x1="606" y1="116" x2="510" y2="190" stroke="black" />
+<line x1="606" y1="116" x2="542" y2="190" stroke="black" />
+<line x1="606" y1="116" x2="574" y2="190" stroke="black" />
+<line x1="606" y1="116" x2="606" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf4i_w.svg b/diagram/xvshuf4i_w.svg
new file mode 100644
index 00000000..f8d4ca15
--- /dev/null
+++ b/diagram/xvshuf4i_w.svg
@@ -0,0 +1,68 @@
+<svg version="1.1"
+     width="670" height="250"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="142" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="174" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="206" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="238" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="270" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="302" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="334" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="366" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="398" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="430" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="462" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="494" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="526" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="558" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="590" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="215" text-anchor="end">returns</text>
+<rect x="110" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="174" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="238" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="302" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="366" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="430" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="494" y="190" width="64" height="26" fill="white" stroke="blue" />
+<rect x="558" y="190" width="64" height="26" fill="white" stroke="blue" />
+<line x1="142" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="142" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="398" y1="116" x2="398" y2="190" stroke="black" />
+<line x1="398" y1="116" x2="462" y2="190" stroke="black" />
+<line x1="398" y1="116" x2="526" y2="190" stroke="black" />
+<line x1="398" y1="116" x2="590" y2="190" stroke="black" />
+<line x1="462" y1="116" x2="398" y2="190" stroke="black" />
+<line x1="462" y1="116" x2="462" y2="190" stroke="black" />
+<line x1="462" y1="116" x2="526" y2="190" stroke="black" />
+<line x1="462" y1="116" x2="590" y2="190" stroke="black" />
+<line x1="526" y1="116" x2="398" y2="190" stroke="black" />
+<line x1="526" y1="116" x2="462" y2="190" stroke="black" />
+<line x1="526" y1="116" x2="526" y2="190" stroke="black" />
+<line x1="526" y1="116" x2="590" y2="190" stroke="black" />
+<line x1="590" y1="116" x2="398" y2="190" stroke="black" />
+<line x1="590" y1="116" x2="462" y2="190" stroke="black" />
+<line x1="590" y1="116" x2="526" y2="190" stroke="black" />
+<line x1="590" y1="116" x2="590" y2="190" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf_b.svg b/diagram/xvshuf_b.svg
new file mode 100644
index 00000000..9dce3330
--- /dev/null
+++ b/diagram/xvshuf_b.svg
@@ -0,0 +1,1464 @@
+<svg version="1.1"
+     width="1240" height="450"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="118" y="103" text-anchor="middle" dominant-baseline="middle">31</text>
+<rect x="126" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="134" y="103" text-anchor="middle" dominant-baseline="middle">30</text>
+<rect x="142" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="150" y="103" text-anchor="middle" dominant-baseline="middle">29</text>
+<rect x="158" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="166" y="103" text-anchor="middle" dominant-baseline="middle">28</text>
+<rect x="174" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="182" y="103" text-anchor="middle" dominant-baseline="middle">27</text>
+<rect x="190" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="198" y="103" text-anchor="middle" dominant-baseline="middle">26</text>
+<rect x="206" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="214" y="103" text-anchor="middle" dominant-baseline="middle">25</text>
+<rect x="222" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="230" y="103" text-anchor="middle" dominant-baseline="middle">24</text>
+<rect x="238" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="246" y="103" text-anchor="middle" dominant-baseline="middle">23</text>
+<rect x="254" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="262" y="103" text-anchor="middle" dominant-baseline="middle">22</text>
+<rect x="270" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="278" y="103" text-anchor="middle" dominant-baseline="middle">21</text>
+<rect x="286" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="294" y="103" text-anchor="middle" dominant-baseline="middle">20</text>
+<rect x="302" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="310" y="103" text-anchor="middle" dominant-baseline="middle">19</text>
+<rect x="318" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="326" y="103" text-anchor="middle" dominant-baseline="middle">18</text>
+<rect x="334" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="342" y="103" text-anchor="middle" dominant-baseline="middle">17</text>
+<rect x="350" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="358" y="103" text-anchor="middle" dominant-baseline="middle">16</text>
+<rect x="366" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="374" y="103" text-anchor="middle" dominant-baseline="middle">31</text>
+<rect x="382" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="390" y="103" text-anchor="middle" dominant-baseline="middle">30</text>
+<rect x="398" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="406" y="103" text-anchor="middle" dominant-baseline="middle">29</text>
+<rect x="414" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="422" y="103" text-anchor="middle" dominant-baseline="middle">28</text>
+<rect x="430" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="438" y="103" text-anchor="middle" dominant-baseline="middle">27</text>
+<rect x="446" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="454" y="103" text-anchor="middle" dominant-baseline="middle">26</text>
+<rect x="462" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="470" y="103" text-anchor="middle" dominant-baseline="middle">25</text>
+<rect x="478" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="486" y="103" text-anchor="middle" dominant-baseline="middle">24</text>
+<rect x="494" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="502" y="103" text-anchor="middle" dominant-baseline="middle">23</text>
+<rect x="510" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="518" y="103" text-anchor="middle" dominant-baseline="middle">22</text>
+<rect x="526" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="534" y="103" text-anchor="middle" dominant-baseline="middle">21</text>
+<rect x="542" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="550" y="103" text-anchor="middle" dominant-baseline="middle">20</text>
+<rect x="558" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="566" y="103" text-anchor="middle" dominant-baseline="middle">19</text>
+<rect x="574" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="582" y="103" text-anchor="middle" dominant-baseline="middle">18</text>
+<rect x="590" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="598" y="103" text-anchor="middle" dominant-baseline="middle">17</text>
+<rect x="606" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="614" y="103" text-anchor="middle" dominant-baseline="middle">16</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="670" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="670" y="115" text-anchor="end">data</text>
+<rect x="680" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="688" y="103" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="696" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="704" y="103" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="712" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="720" y="103" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="728" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="736" y="103" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="744" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="752" y="103" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="760" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="768" y="103" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="776" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="784" y="103" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="792" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="800" y="103" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="808" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="816" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="824" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="832" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="840" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="848" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="856" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="864" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="872" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="880" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="888" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="896" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="904" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="912" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="920" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="928" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="936" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="944" y="103" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="952" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="960" y="103" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="968" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="976" y="103" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="984" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="992" y="103" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="1000" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1008" y="103" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="1016" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1024" y="103" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="1032" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1040" y="103" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="1048" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1056" y="103" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="1064" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1072" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="1080" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1088" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="1096" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1104" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="1112" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1120" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="1128" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1136" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="1144" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1152" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="1160" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1168" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1176" y="90" width="16" height="26" fill="white" stroke="blue" />
+<text x="1184" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="680" y="80" text-anchor="start">upper</text>
+<text x="1192" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">hi</text>
+<text x="100" y="215" text-anchor="end">merged</text>
+<rect x="110" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="118" y="203" text-anchor="middle" dominant-baseline="middle">31</text>
+<rect x="126" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="134" y="203" text-anchor="middle" dominant-baseline="middle">30</text>
+<rect x="142" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="150" y="203" text-anchor="middle" dominant-baseline="middle">29</text>
+<rect x="158" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="166" y="203" text-anchor="middle" dominant-baseline="middle">28</text>
+<rect x="174" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="182" y="203" text-anchor="middle" dominant-baseline="middle">27</text>
+<rect x="190" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="198" y="203" text-anchor="middle" dominant-baseline="middle">26</text>
+<rect x="206" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="214" y="203" text-anchor="middle" dominant-baseline="middle">25</text>
+<rect x="222" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="230" y="203" text-anchor="middle" dominant-baseline="middle">24</text>
+<rect x="238" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="246" y="203" text-anchor="middle" dominant-baseline="middle">23</text>
+<rect x="254" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="262" y="203" text-anchor="middle" dominant-baseline="middle">22</text>
+<rect x="270" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="278" y="203" text-anchor="middle" dominant-baseline="middle">21</text>
+<rect x="286" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="294" y="203" text-anchor="middle" dominant-baseline="middle">20</text>
+<rect x="302" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="310" y="203" text-anchor="middle" dominant-baseline="middle">19</text>
+<rect x="318" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="326" y="203" text-anchor="middle" dominant-baseline="middle">18</text>
+<rect x="334" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="342" y="203" text-anchor="middle" dominant-baseline="middle">17</text>
+<rect x="350" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="358" y="203" text-anchor="middle" dominant-baseline="middle">16</text>
+<rect x="366" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="374" y="203" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="382" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="390" y="203" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="398" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="406" y="203" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="414" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="422" y="203" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="430" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="438" y="203" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="446" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="454" y="203" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="462" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="470" y="203" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="478" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="486" y="203" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="494" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="502" y="203" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="510" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="518" y="203" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="526" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="534" y="203" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="542" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="550" y="203" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="558" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="566" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="574" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="582" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="590" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="598" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="606" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="614" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="670" y="200" text-anchor="end" font-style="italic">lo</text>
+<text x="670" y="215" text-anchor="end">merged</text>
+<rect x="680" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="688" y="203" text-anchor="middle" dominant-baseline="middle">31</text>
+<rect x="696" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="704" y="203" text-anchor="middle" dominant-baseline="middle">30</text>
+<rect x="712" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="720" y="203" text-anchor="middle" dominant-baseline="middle">29</text>
+<rect x="728" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="736" y="203" text-anchor="middle" dominant-baseline="middle">28</text>
+<rect x="744" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="752" y="203" text-anchor="middle" dominant-baseline="middle">27</text>
+<rect x="760" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="768" y="203" text-anchor="middle" dominant-baseline="middle">26</text>
+<rect x="776" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="784" y="203" text-anchor="middle" dominant-baseline="middle">25</text>
+<rect x="792" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="800" y="203" text-anchor="middle" dominant-baseline="middle">24</text>
+<rect x="808" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="816" y="203" text-anchor="middle" dominant-baseline="middle">23</text>
+<rect x="824" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="832" y="203" text-anchor="middle" dominant-baseline="middle">22</text>
+<rect x="840" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="848" y="203" text-anchor="middle" dominant-baseline="middle">21</text>
+<rect x="856" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="864" y="203" text-anchor="middle" dominant-baseline="middle">20</text>
+<rect x="872" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="880" y="203" text-anchor="middle" dominant-baseline="middle">19</text>
+<rect x="888" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="896" y="203" text-anchor="middle" dominant-baseline="middle">18</text>
+<rect x="904" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="912" y="203" text-anchor="middle" dominant-baseline="middle">17</text>
+<rect x="920" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="928" y="203" text-anchor="middle" dominant-baseline="middle">16</text>
+<rect x="936" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="944" y="203" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="952" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="960" y="203" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="968" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="976" y="203" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="984" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="992" y="203" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="1000" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1008" y="203" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="1016" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1024" y="203" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="1032" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1040" y="203" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="1048" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1056" y="203" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="1064" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1072" y="203" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="1080" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1088" y="203" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="1096" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1104" y="203" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="1112" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1120" y="203" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="1128" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1136" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="1144" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1152" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="1160" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1168" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1176" y="190" width="16" height="26" fill="white" stroke="blue" />
+<text x="1184" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">c</text>
+<text x="100" y="315" text-anchor="end">indices</text>
+<rect x="110" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="126" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="142" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="158" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="174" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="190" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="206" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="222" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="254" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="270" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="286" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="302" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="318" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="334" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="350" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="366" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="382" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="398" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="414" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="430" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="446" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="462" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="478" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="494" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="510" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="526" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="542" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="558" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="574" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="590" y="290" width="16" height="26" fill="white" stroke="blue" />
+<rect x="606" y="290" width="16" height="26" fill="white" stroke="blue" />
+<text x="20" y="400">(4)</text>
+<text x="100" y="400" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="415" text-anchor="end">returns</text>
+<rect x="110" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="126" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="142" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="158" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="174" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="190" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="206" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="222" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="238" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="254" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="270" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="286" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="302" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="318" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="334" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="350" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="366" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="382" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="398" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="414" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="430" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="446" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="462" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="478" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="494" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="510" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="526" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="542" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="558" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="574" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="590" y="390" width="16" height="26" fill="white" stroke="blue" />
+<rect x="606" y="390" width="16" height="26" fill="white" stroke="blue" />
+<line x1="118" y1="116" x2="118" y2="190" stroke="black" />
+<line x1="374" y1="116" x2="688" y2="190" stroke="black" />
+<line x1="134" y1="116" x2="134" y2="190" stroke="black" />
+<line x1="390" y1="116" x2="704" y2="190" stroke="black" />
+<line x1="150" y1="116" x2="150" y2="190" stroke="black" />
+<line x1="406" y1="116" x2="720" y2="190" stroke="black" />
+<line x1="166" y1="116" x2="166" y2="190" stroke="black" />
+<line x1="422" y1="116" x2="736" y2="190" stroke="black" />
+<line x1="182" y1="116" x2="182" y2="190" stroke="black" />
+<line x1="438" y1="116" x2="752" y2="190" stroke="black" />
+<line x1="198" y1="116" x2="198" y2="190" stroke="black" />
+<line x1="454" y1="116" x2="768" y2="190" stroke="black" />
+<line x1="214" y1="116" x2="214" y2="190" stroke="black" />
+<line x1="470" y1="116" x2="784" y2="190" stroke="black" />
+<line x1="230" y1="116" x2="230" y2="190" stroke="black" />
+<line x1="486" y1="116" x2="800" y2="190" stroke="black" />
+<line x1="246" y1="116" x2="246" y2="190" stroke="black" />
+<line x1="502" y1="116" x2="816" y2="190" stroke="black" />
+<line x1="262" y1="116" x2="262" y2="190" stroke="black" />
+<line x1="518" y1="116" x2="832" y2="190" stroke="black" />
+<line x1="278" y1="116" x2="278" y2="190" stroke="black" />
+<line x1="534" y1="116" x2="848" y2="190" stroke="black" />
+<line x1="294" y1="116" x2="294" y2="190" stroke="black" />
+<line x1="550" y1="116" x2="864" y2="190" stroke="black" />
+<line x1="310" y1="116" x2="310" y2="190" stroke="black" />
+<line x1="566" y1="116" x2="880" y2="190" stroke="black" />
+<line x1="326" y1="116" x2="326" y2="190" stroke="black" />
+<line x1="582" y1="116" x2="896" y2="190" stroke="black" />
+<line x1="342" y1="116" x2="342" y2="190" stroke="black" />
+<line x1="598" y1="116" x2="912" y2="190" stroke="black" />
+<line x1="358" y1="116" x2="358" y2="190" stroke="black" />
+<line x1="614" y1="116" x2="928" y2="190" stroke="black" />
+<line x1="688" y1="116" x2="374" y2="190" stroke="black" />
+<line x1="944" y1="116" x2="944" y2="190" stroke="black" />
+<line x1="704" y1="116" x2="390" y2="190" stroke="black" />
+<line x1="960" y1="116" x2="960" y2="190" stroke="black" />
+<line x1="720" y1="116" x2="406" y2="190" stroke="black" />
+<line x1="976" y1="116" x2="976" y2="190" stroke="black" />
+<line x1="736" y1="116" x2="422" y2="190" stroke="black" />
+<line x1="992" y1="116" x2="992" y2="190" stroke="black" />
+<line x1="752" y1="116" x2="438" y2="190" stroke="black" />
+<line x1="1008" y1="116" x2="1008" y2="190" stroke="black" />
+<line x1="768" y1="116" x2="454" y2="190" stroke="black" />
+<line x1="1024" y1="116" x2="1024" y2="190" stroke="black" />
+<line x1="784" y1="116" x2="470" y2="190" stroke="black" />
+<line x1="1040" y1="116" x2="1040" y2="190" stroke="black" />
+<line x1="800" y1="116" x2="486" y2="190" stroke="black" />
+<line x1="1056" y1="116" x2="1056" y2="190" stroke="black" />
+<line x1="816" y1="116" x2="502" y2="190" stroke="black" />
+<line x1="1072" y1="116" x2="1072" y2="190" stroke="black" />
+<line x1="832" y1="116" x2="518" y2="190" stroke="black" />
+<line x1="1088" y1="116" x2="1088" y2="190" stroke="black" />
+<line x1="848" y1="116" x2="534" y2="190" stroke="black" />
+<line x1="1104" y1="116" x2="1104" y2="190" stroke="black" />
+<line x1="864" y1="116" x2="550" y2="190" stroke="black" />
+<line x1="1120" y1="116" x2="1120" y2="190" stroke="black" />
+<line x1="880" y1="116" x2="566" y2="190" stroke="black" />
+<line x1="1136" y1="116" x2="1136" y2="190" stroke="black" />
+<line x1="896" y1="116" x2="582" y2="190" stroke="black" />
+<line x1="1152" y1="116" x2="1152" y2="190" stroke="black" />
+<line x1="912" y1="116" x2="598" y2="190" stroke="black" />
+<line x1="1168" y1="116" x2="1168" y2="190" stroke="black" />
+<line x1="928" y1="116" x2="614" y2="190" stroke="black" />
+<line x1="1184" y1="116" x2="1184" y2="190" stroke="black" />
+<line x1="118" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="118" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="688" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="134" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="704" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="150" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="720" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="166" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="736" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="182" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="752" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="198" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="768" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="214" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="784" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="230" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="800" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="246" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="816" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="262" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="832" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="278" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="848" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="294" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="864" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="310" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="880" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="326" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="896" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="342" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="912" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="358" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="928" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="374" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="944" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="390" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="960" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="406" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="976" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="422" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="992" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="438" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1008" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="454" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1024" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="470" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1040" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="486" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1056" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="502" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1072" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="518" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1088" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="534" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1104" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="550" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1120" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="566" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1136" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="582" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1152" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="598" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1168" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="118" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="374" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="134" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="390" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="150" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="406" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="166" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="422" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="182" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="438" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="198" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="454" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="214" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="470" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="230" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="486" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="246" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="502" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="262" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="518" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="278" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="534" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="294" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="550" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="310" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="566" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="326" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="582" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="342" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="598" y2="290" stroke="black" />
+<line x1="614" y1="216" x2="358" y2="290" stroke="black" />
+<line x1="1184" y1="216" x2="614" y2="290" stroke="black" />
+<line x1="118" y1="316" x2="118" y2="390" stroke="black" />
+<line x1="134" y1="316" x2="134" y2="390" stroke="black" />
+<line x1="150" y1="316" x2="150" y2="390" stroke="black" />
+<line x1="166" y1="316" x2="166" y2="390" stroke="black" />
+<line x1="182" y1="316" x2="182" y2="390" stroke="black" />
+<line x1="198" y1="316" x2="198" y2="390" stroke="black" />
+<line x1="214" y1="316" x2="214" y2="390" stroke="black" />
+<line x1="230" y1="316" x2="230" y2="390" stroke="black" />
+<line x1="246" y1="316" x2="246" y2="390" stroke="black" />
+<line x1="262" y1="316" x2="262" y2="390" stroke="black" />
+<line x1="278" y1="316" x2="278" y2="390" stroke="black" />
+<line x1="294" y1="316" x2="294" y2="390" stroke="black" />
+<line x1="310" y1="316" x2="310" y2="390" stroke="black" />
+<line x1="326" y1="316" x2="326" y2="390" stroke="black" />
+<line x1="342" y1="316" x2="342" y2="390" stroke="black" />
+<line x1="358" y1="316" x2="358" y2="390" stroke="black" />
+<line x1="374" y1="316" x2="374" y2="390" stroke="black" />
+<line x1="390" y1="316" x2="390" y2="390" stroke="black" />
+<line x1="406" y1="316" x2="406" y2="390" stroke="black" />
+<line x1="422" y1="316" x2="422" y2="390" stroke="black" />
+<line x1="438" y1="316" x2="438" y2="390" stroke="black" />
+<line x1="454" y1="316" x2="454" y2="390" stroke="black" />
+<line x1="470" y1="316" x2="470" y2="390" stroke="black" />
+<line x1="486" y1="316" x2="486" y2="390" stroke="black" />
+<line x1="502" y1="316" x2="502" y2="390" stroke="black" />
+<line x1="518" y1="316" x2="518" y2="390" stroke="black" />
+<line x1="534" y1="316" x2="534" y2="390" stroke="black" />
+<line x1="550" y1="316" x2="550" y2="390" stroke="black" />
+<line x1="566" y1="316" x2="566" y2="390" stroke="black" />
+<line x1="582" y1="316" x2="582" y2="390" stroke="black" />
+<line x1="598" y1="316" x2="598" y2="390" stroke="black" />
+<line x1="614" y1="316" x2="614" y2="390" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf_d.svg b/diagram/xvshuf_d.svg
new file mode 100644
index 00000000..835b9838
--- /dev/null
+++ b/diagram/xvshuf_d.svg
@@ -0,0 +1,92 @@
+<svg version="1.1"
+     width="1240" height="450"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="174" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="238" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="302" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="366" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="430" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="494" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="558" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="670" y="100" text-anchor="end" font-style="italic">c</text>
+<text x="670" y="115" text-anchor="end">data</text>
+<rect x="680" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="744" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="808" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="872" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="936" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="1000" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1064" y="90" width="128" height="26" fill="white" stroke="blue" />
+<text x="1128" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="680" y="80" text-anchor="start">upper</text>
+<text x="1192" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">hi</text>
+<text x="100" y="215" text-anchor="end">merged</text>
+<rect x="110" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="174" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="238" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="302" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="366" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="430" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="494" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="558" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="670" y="200" text-anchor="end" font-style="italic">lo</text>
+<text x="670" y="215" text-anchor="end">merged</text>
+<rect x="680" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="744" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="808" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="872" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="936" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="1000" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1064" y="190" width="128" height="26" fill="white" stroke="blue" />
+<text x="1128" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="315" text-anchor="end">indices</text>
+<rect x="110" y="290" width="128" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="128" height="26" fill="white" stroke="blue" />
+<rect x="366" y="290" width="128" height="26" fill="white" stroke="blue" />
+<rect x="494" y="290" width="128" height="26" fill="white" stroke="blue" />
+<text x="20" y="400">(4)</text>
+<text x="100" y="400" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="415" text-anchor="end">returns</text>
+<rect x="110" y="390" width="128" height="26" fill="white" stroke="blue" />
+<rect x="238" y="390" width="128" height="26" fill="white" stroke="blue" />
+<rect x="366" y="390" width="128" height="26" fill="white" stroke="blue" />
+<rect x="494" y="390" width="128" height="26" fill="white" stroke="blue" />
+<line x1="174" y1="116" x2="174" y2="190" stroke="black" />
+<line x1="430" y1="116" x2="744" y2="190" stroke="black" />
+<line x1="302" y1="116" x2="302" y2="190" stroke="black" />
+<line x1="558" y1="116" x2="872" y2="190" stroke="black" />
+<line x1="744" y1="116" x2="430" y2="190" stroke="black" />
+<line x1="1000" y1="116" x2="1000" y2="190" stroke="black" />
+<line x1="872" y1="116" x2="558" y2="190" stroke="black" />
+<line x1="1128" y1="116" x2="1128" y2="190" stroke="black" />
+<line x1="174" y1="216" x2="174" y2="290" stroke="black" />
+<line x1="744" y1="216" x2="430" y2="290" stroke="black" />
+<line x1="174" y1="216" x2="302" y2="290" stroke="black" />
+<line x1="744" y1="216" x2="558" y2="290" stroke="black" />
+<line x1="302" y1="216" x2="174" y2="290" stroke="black" />
+<line x1="872" y1="216" x2="430" y2="290" stroke="black" />
+<line x1="302" y1="216" x2="302" y2="290" stroke="black" />
+<line x1="872" y1="216" x2="558" y2="290" stroke="black" />
+<line x1="430" y1="216" x2="174" y2="290" stroke="black" />
+<line x1="1000" y1="216" x2="430" y2="290" stroke="black" />
+<line x1="430" y1="216" x2="302" y2="290" stroke="black" />
+<line x1="1000" y1="216" x2="558" y2="290" stroke="black" />
+<line x1="558" y1="216" x2="174" y2="290" stroke="black" />
+<line x1="1128" y1="216" x2="430" y2="290" stroke="black" />
+<line x1="558" y1="216" x2="302" y2="290" stroke="black" />
+<line x1="1128" y1="216" x2="558" y2="290" stroke="black" />
+<line x1="174" y1="316" x2="174" y2="390" stroke="black" />
+<line x1="302" y1="316" x2="302" y2="390" stroke="black" />
+<line x1="430" y1="316" x2="430" y2="390" stroke="black" />
+<line x1="558" y1="316" x2="558" y2="390" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf_h.svg b/diagram/xvshuf_h.svg
new file mode 100644
index 00000000..731a277e
--- /dev/null
+++ b/diagram/xvshuf_h.svg
@@ -0,0 +1,488 @@
+<svg version="1.1"
+     width="1240" height="450"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="126" y="103" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="142" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="158" y="103" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="174" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="190" y="103" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="206" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="222" y="103" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="238" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="254" y="103" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="270" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="286" y="103" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="302" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="318" y="103" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="334" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="350" y="103" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="366" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="382" y="103" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="398" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="414" y="103" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="430" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="446" y="103" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="462" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="478" y="103" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="494" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="510" y="103" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="526" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="542" y="103" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="558" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="574" y="103" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="590" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="606" y="103" text-anchor="middle" dominant-baseline="middle">8</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="670" y="100" text-anchor="end" font-style="italic">c</text>
+<text x="670" y="115" text-anchor="end">data</text>
+<rect x="680" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="696" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="712" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="728" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="744" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="760" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="776" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="792" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="808" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="824" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="840" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="856" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="872" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="888" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="904" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="920" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="936" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="952" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="968" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="984" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="1000" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="1016" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="1032" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="1048" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="1064" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="1080" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="1096" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="1112" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="1128" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="1144" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1160" y="90" width="32" height="26" fill="white" stroke="blue" />
+<text x="1176" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="680" y="80" text-anchor="start">upper</text>
+<text x="1192" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">hi</text>
+<text x="100" y="215" text-anchor="end">merged</text>
+<rect x="110" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="126" y="203" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="142" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="158" y="203" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="174" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="190" y="203" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="206" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="222" y="203" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="238" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="254" y="203" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="270" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="286" y="203" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="302" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="318" y="203" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="334" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="350" y="203" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="366" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="382" y="203" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="398" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="414" y="203" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="430" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="446" y="203" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="462" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="478" y="203" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="494" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="510" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="526" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="542" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="558" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="574" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="590" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="606" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="670" y="200" text-anchor="end" font-style="italic">lo</text>
+<text x="670" y="215" text-anchor="end">merged</text>
+<rect x="680" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="696" y="203" text-anchor="middle" dominant-baseline="middle">15</text>
+<rect x="712" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="728" y="203" text-anchor="middle" dominant-baseline="middle">14</text>
+<rect x="744" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="760" y="203" text-anchor="middle" dominant-baseline="middle">13</text>
+<rect x="776" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="792" y="203" text-anchor="middle" dominant-baseline="middle">12</text>
+<rect x="808" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="824" y="203" text-anchor="middle" dominant-baseline="middle">11</text>
+<rect x="840" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="856" y="203" text-anchor="middle" dominant-baseline="middle">10</text>
+<rect x="872" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="888" y="203" text-anchor="middle" dominant-baseline="middle">9</text>
+<rect x="904" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="920" y="203" text-anchor="middle" dominant-baseline="middle">8</text>
+<rect x="936" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="952" y="203" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="968" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="984" y="203" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="1000" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="1016" y="203" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="1032" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="1048" y="203" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="1064" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="1080" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="1096" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="1112" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="1128" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="1144" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1160" y="190" width="32" height="26" fill="white" stroke="blue" />
+<text x="1176" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="315" text-anchor="end">indices</text>
+<rect x="110" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="142" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="174" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="206" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="270" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="302" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="334" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="366" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="398" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="430" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="462" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="494" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="526" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="558" y="290" width="32" height="26" fill="white" stroke="blue" />
+<rect x="590" y="290" width="32" height="26" fill="white" stroke="blue" />
+<text x="20" y="400">(4)</text>
+<text x="100" y="400" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="415" text-anchor="end">returns</text>
+<rect x="110" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="142" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="174" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="206" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="238" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="270" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="302" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="334" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="366" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="398" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="430" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="462" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="494" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="526" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="558" y="390" width="32" height="26" fill="white" stroke="blue" />
+<rect x="590" y="390" width="32" height="26" fill="white" stroke="blue" />
+<line x1="126" y1="116" x2="126" y2="190" stroke="black" />
+<line x1="382" y1="116" x2="696" y2="190" stroke="black" />
+<line x1="158" y1="116" x2="158" y2="190" stroke="black" />
+<line x1="414" y1="116" x2="728" y2="190" stroke="black" />
+<line x1="190" y1="116" x2="190" y2="190" stroke="black" />
+<line x1="446" y1="116" x2="760" y2="190" stroke="black" />
+<line x1="222" y1="116" x2="222" y2="190" stroke="black" />
+<line x1="478" y1="116" x2="792" y2="190" stroke="black" />
+<line x1="254" y1="116" x2="254" y2="190" stroke="black" />
+<line x1="510" y1="116" x2="824" y2="190" stroke="black" />
+<line x1="286" y1="116" x2="286" y2="190" stroke="black" />
+<line x1="542" y1="116" x2="856" y2="190" stroke="black" />
+<line x1="318" y1="116" x2="318" y2="190" stroke="black" />
+<line x1="574" y1="116" x2="888" y2="190" stroke="black" />
+<line x1="350" y1="116" x2="350" y2="190" stroke="black" />
+<line x1="606" y1="116" x2="920" y2="190" stroke="black" />
+<line x1="696" y1="116" x2="382" y2="190" stroke="black" />
+<line x1="952" y1="116" x2="952" y2="190" stroke="black" />
+<line x1="728" y1="116" x2="414" y2="190" stroke="black" />
+<line x1="984" y1="116" x2="984" y2="190" stroke="black" />
+<line x1="760" y1="116" x2="446" y2="190" stroke="black" />
+<line x1="1016" y1="116" x2="1016" y2="190" stroke="black" />
+<line x1="792" y1="116" x2="478" y2="190" stroke="black" />
+<line x1="1048" y1="116" x2="1048" y2="190" stroke="black" />
+<line x1="824" y1="116" x2="510" y2="190" stroke="black" />
+<line x1="1080" y1="116" x2="1080" y2="190" stroke="black" />
+<line x1="856" y1="116" x2="542" y2="190" stroke="black" />
+<line x1="1112" y1="116" x2="1112" y2="190" stroke="black" />
+<line x1="888" y1="116" x2="574" y2="190" stroke="black" />
+<line x1="1144" y1="116" x2="1144" y2="190" stroke="black" />
+<line x1="920" y1="116" x2="606" y2="190" stroke="black" />
+<line x1="1176" y1="116" x2="1176" y2="190" stroke="black" />
+<line x1="126" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="126" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="696" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="158" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="728" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="190" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="760" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="222" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="792" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="254" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="824" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="286" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="856" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="318" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="888" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="350" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="920" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="382" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="952" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="414" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="984" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="446" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="1016" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="478" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="1048" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="510" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="1080" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="542" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="1112" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="574" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="1144" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="126" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="382" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="158" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="414" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="190" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="446" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="222" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="478" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="254" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="510" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="286" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="542" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="318" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="574" y2="290" stroke="black" />
+<line x1="606" y1="216" x2="350" y2="290" stroke="black" />
+<line x1="1176" y1="216" x2="606" y2="290" stroke="black" />
+<line x1="126" y1="316" x2="126" y2="390" stroke="black" />
+<line x1="158" y1="316" x2="158" y2="390" stroke="black" />
+<line x1="190" y1="316" x2="190" y2="390" stroke="black" />
+<line x1="222" y1="316" x2="222" y2="390" stroke="black" />
+<line x1="254" y1="316" x2="254" y2="390" stroke="black" />
+<line x1="286" y1="316" x2="286" y2="390" stroke="black" />
+<line x1="318" y1="316" x2="318" y2="390" stroke="black" />
+<line x1="350" y1="316" x2="350" y2="390" stroke="black" />
+<line x1="382" y1="316" x2="382" y2="390" stroke="black" />
+<line x1="414" y1="316" x2="414" y2="390" stroke="black" />
+<line x1="446" y1="316" x2="446" y2="390" stroke="black" />
+<line x1="478" y1="316" x2="478" y2="390" stroke="black" />
+<line x1="510" y1="316" x2="510" y2="390" stroke="black" />
+<line x1="542" y1="316" x2="542" y2="390" stroke="black" />
+<line x1="574" y1="316" x2="574" y2="390" stroke="black" />
+<line x1="606" y1="316" x2="606" y2="390" stroke="black" />
+</svg>
diff --git a/diagram/xvshuf_w.svg b/diagram/xvshuf_w.svg
new file mode 100644
index 00000000..9e4999c2
--- /dev/null
+++ b/diagram/xvshuf_w.svg
@@ -0,0 +1,192 @@
+<svg version="1.1"
+     width="1240" height="450"
+     xmlns="http://www.w3.org/2000/svg">
+<text x="20" y="100">(1)</text>
+<text x="100" y="100" text-anchor="end" font-style="italic">b</text>
+<text x="100" y="115" text-anchor="end">data</text>
+<rect x="110" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="142" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="174" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="206" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="238" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="270" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="302" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="334" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="366" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="398" y="103" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="430" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="462" y="103" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="494" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="526" y="103" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="558" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="590" y="103" text-anchor="middle" dominant-baseline="middle">4</text>
+<text x="110" y="80" text-anchor="start">upper</text>
+<text x="622" y="80" text-anchor="end">lower</text>
+<text x="670" y="100" text-anchor="end" font-style="italic">c</text>
+<text x="670" y="115" text-anchor="end">data</text>
+<rect x="680" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="712" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="744" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="776" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="808" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="840" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="872" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="904" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<rect x="936" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="968" y="103" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="1000" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="1032" y="103" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="1064" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="1096" y="103" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1128" y="90" width="64" height="26" fill="white" stroke="blue" />
+<text x="1160" y="103" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="680" y="80" text-anchor="start">upper</text>
+<text x="1192" y="80" text-anchor="end">lower</text>
+<text x="20" y="200">(2)</text>
+<text x="100" y="200" text-anchor="end" font-style="italic">hi</text>
+<text x="100" y="215" text-anchor="end">merged</text>
+<rect x="110" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="142" y="203" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="174" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="206" y="203" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="238" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="270" y="203" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="302" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="334" y="203" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="366" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="398" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="430" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="462" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="494" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="526" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="558" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="590" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="670" y="200" text-anchor="end" font-style="italic">lo</text>
+<text x="670" y="215" text-anchor="end">merged</text>
+<rect x="680" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="712" y="203" text-anchor="middle" dominant-baseline="middle">7</text>
+<rect x="744" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="776" y="203" text-anchor="middle" dominant-baseline="middle">6</text>
+<rect x="808" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="840" y="203" text-anchor="middle" dominant-baseline="middle">5</text>
+<rect x="872" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="904" y="203" text-anchor="middle" dominant-baseline="middle">4</text>
+<rect x="936" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="968" y="203" text-anchor="middle" dominant-baseline="middle">3</text>
+<rect x="1000" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="1032" y="203" text-anchor="middle" dominant-baseline="middle">2</text>
+<rect x="1064" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="1096" y="203" text-anchor="middle" dominant-baseline="middle">1</text>
+<rect x="1128" y="190" width="64" height="26" fill="white" stroke="blue" />
+<text x="1160" y="203" text-anchor="middle" dominant-baseline="middle">0</text>
+<text x="20" y="300">(3)</text>
+<text x="100" y="300" text-anchor="end" font-style="italic">a</text>
+<text x="100" y="315" text-anchor="end">indices</text>
+<rect x="110" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="174" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="238" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="302" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="366" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="430" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="494" y="290" width="64" height="26" fill="white" stroke="blue" />
+<rect x="558" y="290" width="64" height="26" fill="white" stroke="blue" />
+<text x="20" y="400">(4)</text>
+<text x="100" y="400" text-anchor="end" font-style="italic">ret</text>
+<text x="100" y="415" text-anchor="end">returns</text>
+<rect x="110" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="174" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="238" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="302" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="366" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="430" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="494" y="390" width="64" height="26" fill="white" stroke="blue" />
+<rect x="558" y="390" width="64" height="26" fill="white" stroke="blue" />
+<line x1="142" y1="116" x2="142" y2="190" stroke="black" />
+<line x1="398" y1="116" x2="712" y2="190" stroke="black" />
+<line x1="206" y1="116" x2="206" y2="190" stroke="black" />
+<line x1="462" y1="116" x2="776" y2="190" stroke="black" />
+<line x1="270" y1="116" x2="270" y2="190" stroke="black" />
+<line x1="526" y1="116" x2="840" y2="190" stroke="black" />
+<line x1="334" y1="116" x2="334" y2="190" stroke="black" />
+<line x1="590" y1="116" x2="904" y2="190" stroke="black" />
+<line x1="712" y1="116" x2="398" y2="190" stroke="black" />
+<line x1="968" y1="116" x2="968" y2="190" stroke="black" />
+<line x1="776" y1="116" x2="462" y2="190" stroke="black" />
+<line x1="1032" y1="116" x2="1032" y2="190" stroke="black" />
+<line x1="840" y1="116" x2="526" y2="190" stroke="black" />
+<line x1="1096" y1="116" x2="1096" y2="190" stroke="black" />
+<line x1="904" y1="116" x2="590" y2="190" stroke="black" />
+<line x1="1160" y1="116" x2="1160" y2="190" stroke="black" />
+<line x1="142" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="712" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="142" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="712" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="142" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="712" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="142" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="712" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="206" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="776" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="206" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="776" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="206" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="776" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="206" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="776" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="270" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="840" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="270" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="840" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="270" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="840" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="270" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="840" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="334" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="904" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="334" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="904" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="334" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="904" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="334" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="904" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="398" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="968" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="398" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="968" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="398" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="968" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="398" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="968" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="462" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="1032" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="462" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="1032" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="462" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="1032" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="462" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="1032" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="526" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="1096" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="526" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="1096" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="526" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="1096" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="526" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="1096" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="590" y1="216" x2="142" y2="290" stroke="black" />
+<line x1="1160" y1="216" x2="398" y2="290" stroke="black" />
+<line x1="590" y1="216" x2="206" y2="290" stroke="black" />
+<line x1="1160" y1="216" x2="462" y2="290" stroke="black" />
+<line x1="590" y1="216" x2="270" y2="290" stroke="black" />
+<line x1="1160" y1="216" x2="526" y2="290" stroke="black" />
+<line x1="590" y1="216" x2="334" y2="290" stroke="black" />
+<line x1="1160" y1="216" x2="590" y2="290" stroke="black" />
+<line x1="142" y1="316" x2="142" y2="390" stroke="black" />
+<line x1="206" y1="316" x2="206" y2="390" stroke="black" />
+<line x1="270" y1="316" x2="270" y2="390" stroke="black" />
+<line x1="334" y1="316" x2="334" y2="390" stroke="black" />
+<line x1="398" y1="316" x2="398" y2="390" stroke="black" />
+<line x1="462" y1="316" x2="462" y2="390" stroke="black" />
+<line x1="526" y1="316" x2="526" y2="390" stroke="black" />
+<line x1="590" y1="316" x2="590" y2="390" stroke="black" />
+</svg>
diff --git a/img/favicon.ico b/img/favicon.ico
new file mode 100644
index 00000000..e85006a3
Binary files /dev/null and b/img/favicon.ico differ
diff --git a/index.html b/index.html
new file mode 100644
index 00000000..84fe14a0
--- /dev/null
+++ b/index.html
@@ -0,0 +1,227 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="description" content="Unofficial LoongArch Intrinsics Guide" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/" />
+      <link rel="shortcut icon" href="img/favicon.ico" />
+    <title>Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="css/theme.css" />
+    <link rel="stylesheet" href="css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Unofficial LoongArch Intrinsics Guide";
+        var mkdocs_page_input_path = "index.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul class="current">
+                <li class="toctree-l1 current"><a class="reference internal current" href=".">Unofficial LoongArch Intrinsics Guide</a>
+    <ul class="current">
+    </ul>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href=".">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="." class="icon icon-home" aria-label="Docs"></a></li>
+      <li class="breadcrumb-item active">Unofficial LoongArch Intrinsics Guide</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="unofficial-loongarch-intrinsics-guide">Unofficial LoongArch Intrinsics Guide</h1>
+<p>This is the Unofficial LoongArch Intrinsics Guide by Jiajie Chen et al. The documentation is arranged from the following sources:</p>
+<ul>
+<li>QEMU</li>
+<li>GCC</li>
+<li>Observations from real hardware incl. 3C5000 and 3A6000</li>
+</ul>
+<p>The guide provides pseudo code for the SIMD intrinsics. The code assumes that the elements of the LSX/LASX vector registers can be accessed via members of a <code>union</code>:</p>
+<pre><code class="language-cpp">union lsx_register {
+  uint8_t byte[16];
+  uint16_t half[8];
+  uint32_t word[4];
+  uint64_t dword[2];
+  uint128_t qword[1];
+  float fp32[4];
+  double fp64[2];
+};
+
+union lasx_register {
+  uint8_t byte[32];
+  uint16_t half[16];
+  uint32_t word[8];
+  uint64_t dword[4];
+  uint128_t qword[2];
+  float fp32[8];
+  double fp64[4];
+};
+</code></pre>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="latency_throughput/" class="btn btn-neutral float-right" title="Latency and Throughput of Instructions">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+    
+      <span><a href="latency_throughput/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = ".";</script>
+    <script src="js/theme_extra.js"></script>
+    <script src="js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
+
+<!--
+MkDocs version : 1.5.3
+Build Date UTC : 2024-07-17 05:31:45.828664+00:00
+-->
diff --git a/js/html5shiv.min.js b/js/html5shiv.min.js
new file mode 100644
index 00000000..1a01c94b
--- /dev/null
+++ b/js/html5shiv.min.js
@@ -0,0 +1,4 @@
+/**
+* @preserve HTML5 Shiv 3.7.3 | @afarkas @jdalton @jon_neal @rem | MIT/GPL2 Licensed
+*/
+/* Minified third-party code (HTML5 Shiv). Reads optional settings (elements, shivCSS, shivMethods) from a pre-existing window.html5, publishes the API object as window.html5 (and as module.exports when a CommonJS module object exists), and immediately shivs the current document: default CSS for HTML5 elements is injected and createElement/createDocumentFragment are patched only when the browser fails the corresponding feature tests. */!function(a,b){function c(a,b){var c=a.createElement("p"),d=a.getElementsByTagName("head")[0]||a.documentElement;return c.innerHTML="x<style>"+b+"</style>",d.insertBefore(c.lastChild,d.firstChild)}function d(){var a=t.elements;return"string"==typeof a?a.split(" "):a}function e(a,b){var c=t.elements;"string"!=typeof c&&(c=c.join(" ")),"string"!=typeof a&&(a=a.join(" ")),t.elements=c+" "+a,j(b)}function f(a){var b=s[a[q]];return b||(b={},r++,a[q]=r,s[r]=b),b}function g(a,c,d){if(c||(c=b),l)return c.createElement(a);d||(d=f(c));var e;return e=d.cache[a]?d.cache[a].cloneNode():p.test(a)?(d.cache[a]=d.createElem(a)).cloneNode():d.createElem(a),!e.canHaveChildren||o.test(a)||e.tagUrn?e:d.frag.appendChild(e)}function h(a,c){if(a||(a=b),l)return a.createDocumentFragment();c=c||f(a);for(var e=c.frag.cloneNode(),g=0,h=d(),i=h.length;i>g;g++)e.createElement(h[g]);return e}function i(a,b){b.cache||(b.cache={},b.createElem=a.createElement,b.createFrag=a.createDocumentFragment,b.frag=b.createFrag()),a.createElement=function(c){return t.shivMethods?g(c,a,b):b.createElem(c)},a.createDocumentFragment=Function("h,f","return function(){var n=f.cloneNode(),c=n.createElement;h.shivMethods&&("+d().join().replace(/[\w\-:]+/g,function(a){return b.createElem(a),b.frag.createElement(a),'c("'+a+'")'})+");return n}")(t,b.frag)}function j(a){a||(a=b);var d=f(a);return!t.shivCSS||k||d.hasCSS||(d.hasCSS=!!c(a,"article,aside,dialog,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}mark{background:#FF0;color:#000}template{display:none}")),l||i(a,d),a}var k,l,m="3.7.3",n=a.html5||{},o=/^<|^(?:button|map|select|textarea|object|iframe|option|optgroup)$/i,p=/^(?:a|b|code|div|fieldset|h1|h2|h3|h4|h5|h6|i|label|li|ol|p|q|span|strong|style|table|tbody|td|th|tr|ul)$/i,q="_html5shiv",r=0,s={};!function(){try{var a=b.createElement("a");a.innerHTML="<xyz></xyz>",k="hidden"in a,l=1==a.childNodes.length||function(){b.createElement("a");var 
a=b.createDocumentFragment();return"undefined"==typeof a.cloneNode||"undefined"==typeof a.createDocumentFragment||"undefined"==typeof a.createElement}()}catch(c){k=!0,l=!0}}();var t={elements:n.elements||"abbr article aside audio bdi canvas data datalist details dialog figcaption figure footer header hgroup main mark meter nav output picture progress section summary template time video",version:m,shivCSS:n.shivCSS!==!1,supportsUnknownElements:l,shivMethods:n.shivMethods!==!1,type:"default",shivDocument:j,createElement:g,createDocumentFragment:h,addElements:e};a.html5=t,j(b),"object"==typeof module&&module.exports&&(module.exports=t)}("undefined"!=typeof window?window:this,document);
diff --git a/js/jquery-3.6.0.min.js b/js/jquery-3.6.0.min.js
new file mode 100644
index 00000000..c4c6022f
--- /dev/null
+++ b/js/jquery-3.6.0.min.js
@@ -0,0 +1,2 @@
+/*! jQuery v3.6.0 | (c) OpenJS Foundation and other contributors | jquery.org/license */
+!function(e,t){"use strict";"object"==typeof module&&"object"==typeof module.exports?module.exports=e.document?t(e,!0):function(e){if(!e.document)throw new Error("jQuery requires a window with a document");return t(e)}:t(e)}("undefined"!=typeof window?window:this,function(C,e){"use strict";var t=[],r=Object.getPrototypeOf,s=t.slice,g=t.flat?function(e){return t.flat.call(e)}:function(e){return t.concat.apply([],e)},u=t.push,i=t.indexOf,n={},o=n.toString,v=n.hasOwnProperty,a=v.toString,l=a.call(Object),y={},m=function(e){return"function"==typeof e&&"number"!=typeof e.nodeType&&"function"!=typeof e.item},x=function(e){return null!=e&&e===e.window},E=C.document,c={type:!0,src:!0,nonce:!0,noModule:!0};function b(e,t,n){var r,i,o=(n=n||E).createElement("script");if(o.text=e,t)for(r in c)(i=t[r]||t.getAttribute&&t.getAttribute(r))&&o.setAttribute(r,i);n.head.appendChild(o).parentNode.removeChild(o)}function w(e){return null==e?e+"":"object"==typeof e||"function"==typeof e?n[o.call(e)]||"object":typeof e}var f="3.6.0",S=function(e,t){return new S.fn.init(e,t)};function p(e){var t=!!e&&"length"in e&&e.length,n=w(e);return!m(e)&&!x(e)&&("array"===n||0===t||"number"==typeof t&&0<t&&t-1 in e)}S.fn=S.prototype={jquery:f,constructor:S,length:0,toArray:function(){return s.call(this)},get:function(e){return null==e?s.call(this):e<0?this[e+this.length]:this[e]},pushStack:function(e){var t=S.merge(this.constructor(),e);return t.prevObject=this,t},each:function(e){return S.each(this,e)},map:function(n){return this.pushStack(S.map(this,function(e,t){return n.call(e,t,e)}))},slice:function(){return this.pushStack(s.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},even:function(){return this.pushStack(S.grep(this,function(e,t){return(t+1)%2}))},odd:function(){return this.pushStack(S.grep(this,function(e,t){return t%2}))},eq:function(e){var t=this.length,n=+e+(e<0?t:0);return this.pushStack(0<=n&&n<t?[this[n]]:[])},end:function(){return 
this.prevObject||this.constructor()},push:u,sort:t.sort,splice:t.splice},S.extend=S.fn.extend=function(){var e,t,n,r,i,o,a=arguments[0]||{},s=1,u=arguments.length,l=!1;for("boolean"==typeof a&&(l=a,a=arguments[s]||{},s++),"object"==typeof a||m(a)||(a={}),s===u&&(a=this,s--);s<u;s++)if(null!=(e=arguments[s]))for(t in e)r=e[t],"__proto__"!==t&&a!==r&&(l&&r&&(S.isPlainObject(r)||(i=Array.isArray(r)))?(n=a[t],o=i&&!Array.isArray(n)?[]:i||S.isPlainObject(n)?n:{},i=!1,a[t]=S.extend(l,o,r)):void 0!==r&&(a[t]=r));return a},S.extend({expando:"jQuery"+(f+Math.random()).replace(/\D/g,""),isReady:!0,error:function(e){throw new Error(e)},noop:function(){},isPlainObject:function(e){var t,n;return!(!e||"[object Object]"!==o.call(e))&&(!(t=r(e))||"function"==typeof(n=v.call(t,"constructor")&&t.constructor)&&a.call(n)===l)},isEmptyObject:function(e){var t;for(t in e)return!1;return!0},globalEval:function(e,t,n){b(e,{nonce:t&&t.nonce},n)},each:function(e,t){var n,r=0;if(p(e)){for(n=e.length;r<n;r++)if(!1===t.call(e[r],r,e[r]))break}else for(r in e)if(!1===t.call(e[r],r,e[r]))break;return e},makeArray:function(e,t){var n=t||[];return null!=e&&(p(Object(e))?S.merge(n,"string"==typeof e?[e]:e):u.call(n,e)),n},inArray:function(e,t,n){return null==t?-1:i.call(t,e,n)},merge:function(e,t){for(var n=+t.length,r=0,i=e.length;r<n;r++)e[i++]=t[r];return e.length=i,e},grep:function(e,t,n){for(var r=[],i=0,o=e.length,a=!n;i<o;i++)!t(e[i],i)!==a&&r.push(e[i]);return r},map:function(e,t,n){var r,i,o=0,a=[];if(p(e))for(r=e.length;o<r;o++)null!=(i=t(e[o],o,n))&&a.push(i);else for(o in e)null!=(i=t(e[o],o,n))&&a.push(i);return g(a)},guid:1,support:y}),"function"==typeof Symbol&&(S.fn[Symbol.iterator]=t[Symbol.iterator]),S.each("Boolean Number String Function Array Date RegExp Object Error Symbol".split(" "),function(e,t){n["[object "+t+"]"]=t.toLowerCase()});var d=function(n){var e,d,b,o,i,h,f,g,w,u,l,T,C,a,E,v,s,c,y,S="sizzle"+1*new 
Date,p=n.document,k=0,r=0,m=ue(),x=ue(),A=ue(),N=ue(),j=function(e,t){return e===t&&(l=!0),0},D={}.hasOwnProperty,t=[],q=t.pop,L=t.push,H=t.push,O=t.slice,P=function(e,t){for(var n=0,r=e.length;n<r;n++)if(e[n]===t)return n;return-1},R="checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped",M="[\\x20\\t\\r\\n\\f]",I="(?:\\\\[\\da-fA-F]{1,6}"+M+"?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+",W="\\["+M+"*("+I+")(?:"+M+"*([*^$|!~]?=)"+M+"*(?:'((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\"|("+I+"))|)"+M+"*\\]",F=":("+I+")(?:\\((('((?:\\\\.|[^\\\\'])*)'|\"((?:\\\\.|[^\\\\\"])*)\")|((?:\\\\.|[^\\\\()[\\]]|"+W+")*)|.*)\\)|)",B=new RegExp(M+"+","g"),$=new RegExp("^"+M+"+|((?:^|[^\\\\])(?:\\\\.)*)"+M+"+$","g"),_=new RegExp("^"+M+"*,"+M+"*"),z=new RegExp("^"+M+"*([>+~]|"+M+")"+M+"*"),U=new RegExp(M+"|>"),X=new RegExp(F),V=new RegExp("^"+I+"$"),G={ID:new RegExp("^#("+I+")"),CLASS:new RegExp("^\\.("+I+")"),TAG:new RegExp("^("+I+"|[*])"),ATTR:new RegExp("^"+W),PSEUDO:new RegExp("^"+F),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+M+"*(even|odd|(([+-]|)(\\d*)n|)"+M+"*(?:([+-]|)"+M+"*(\\d+)|))"+M+"*\\)|)","i"),bool:new RegExp("^(?:"+R+")$","i"),needsContext:new RegExp("^"+M+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+M+"*((?:-\\d)?\\d*)"+M+"*\\)|)(?=[^-]|$)","i")},Y=/HTML$/i,Q=/^(?:input|select|textarea|button)$/i,J=/^h\d$/i,K=/^[^{]+\{\s*\[native \w/,Z=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,ee=/[+~]/,te=new RegExp("\\\\[\\da-fA-F]{1,6}"+M+"?|\\\\([^\\r\\n\\f])","g"),ne=function(e,t){var n="0x"+e.slice(1)-65536;return t||(n<0?String.fromCharCode(n+65536):String.fromCharCode(n>>10|55296,1023&n|56320))},re=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\0-\x1f\x7f-\uFFFF\w-]/g,ie=function(e,t){return t?"\0"===e?"\ufffd":e.slice(0,-1)+"\\"+e.charCodeAt(e.length-1).toString(16)+" 
":"\\"+e},oe=function(){T()},ae=be(function(e){return!0===e.disabled&&"fieldset"===e.nodeName.toLowerCase()},{dir:"parentNode",next:"legend"});try{H.apply(t=O.call(p.childNodes),p.childNodes),t[p.childNodes.length].nodeType}catch(e){H={apply:t.length?function(e,t){L.apply(e,O.call(t))}:function(e,t){var n=e.length,r=0;while(e[n++]=t[r++]);e.length=n-1}}}function se(t,e,n,r){var i,o,a,s,u,l,c,f=e&&e.ownerDocument,p=e?e.nodeType:9;if(n=n||[],"string"!=typeof t||!t||1!==p&&9!==p&&11!==p)return n;if(!r&&(T(e),e=e||C,E)){if(11!==p&&(u=Z.exec(t)))if(i=u[1]){if(9===p){if(!(a=e.getElementById(i)))return n;if(a.id===i)return n.push(a),n}else if(f&&(a=f.getElementById(i))&&y(e,a)&&a.id===i)return n.push(a),n}else{if(u[2])return H.apply(n,e.getElementsByTagName(t)),n;if((i=u[3])&&d.getElementsByClassName&&e.getElementsByClassName)return H.apply(n,e.getElementsByClassName(i)),n}if(d.qsa&&!N[t+" "]&&(!v||!v.test(t))&&(1!==p||"object"!==e.nodeName.toLowerCase())){if(c=t,f=e,1===p&&(U.test(t)||z.test(t))){(f=ee.test(t)&&ye(e.parentNode)||e)===e&&d.scope||((s=e.getAttribute("id"))?s=s.replace(re,ie):e.setAttribute("id",s=S)),o=(l=h(t)).length;while(o--)l[o]=(s?"#"+s:":scope")+" "+xe(l[o]);c=l.join(",")}try{return H.apply(n,f.querySelectorAll(c)),n}catch(e){N(t,!0)}finally{s===S&&e.removeAttribute("id")}}}return g(t.replace($,"$1"),e,n,r)}function ue(){var r=[];return function e(t,n){return r.push(t+" ")>b.cacheLength&&delete e[r.shift()],e[t+" "]=n}}function le(e){return e[S]=!0,e}function ce(e){var t=C.createElement("fieldset");try{return!!e(t)}catch(e){return!1}finally{t.parentNode&&t.parentNode.removeChild(t),t=null}}function fe(e,t){var n=e.split("|"),r=n.length;while(r--)b.attrHandle[n[r]]=t}function pe(e,t){var n=t&&e,r=n&&1===e.nodeType&&1===t.nodeType&&e.sourceIndex-t.sourceIndex;if(r)return r;if(n)while(n=n.nextSibling)if(n===t)return-1;return e?1:-1}function de(t){return function(e){return"input"===e.nodeName.toLowerCase()&&e.type===t}}function he(n){return 
function(e){var t=e.nodeName.toLowerCase();return("input"===t||"button"===t)&&e.type===n}}function ge(t){return function(e){return"form"in e?e.parentNode&&!1===e.disabled?"label"in e?"label"in e.parentNode?e.parentNode.disabled===t:e.disabled===t:e.isDisabled===t||e.isDisabled!==!t&&ae(e)===t:e.disabled===t:"label"in e&&e.disabled===t}}function ve(a){return le(function(o){return o=+o,le(function(e,t){var n,r=a([],e.length,o),i=r.length;while(i--)e[n=r[i]]&&(e[n]=!(t[n]=e[n]))})})}function ye(e){return e&&"undefined"!=typeof e.getElementsByTagName&&e}for(e in d=se.support={},i=se.isXML=function(e){var t=e&&e.namespaceURI,n=e&&(e.ownerDocument||e).documentElement;return!Y.test(t||n&&n.nodeName||"HTML")},T=se.setDocument=function(e){var t,n,r=e?e.ownerDocument||e:p;return r!=C&&9===r.nodeType&&r.documentElement&&(a=(C=r).documentElement,E=!i(C),p!=C&&(n=C.defaultView)&&n.top!==n&&(n.addEventListener?n.addEventListener("unload",oe,!1):n.attachEvent&&n.attachEvent("onunload",oe)),d.scope=ce(function(e){return a.appendChild(e).appendChild(C.createElement("div")),"undefined"!=typeof e.querySelectorAll&&!e.querySelectorAll(":scope fieldset div").length}),d.attributes=ce(function(e){return e.className="i",!e.getAttribute("className")}),d.getElementsByTagName=ce(function(e){return e.appendChild(C.createComment("")),!e.getElementsByTagName("*").length}),d.getElementsByClassName=K.test(C.getElementsByClassName),d.getById=ce(function(e){return a.appendChild(e).id=S,!C.getElementsByName||!C.getElementsByName(S).length}),d.getById?(b.filter.ID=function(e){var t=e.replace(te,ne);return function(e){return e.getAttribute("id")===t}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var n=t.getElementById(e);return n?[n]:[]}}):(b.filter.ID=function(e){var n=e.replace(te,ne);return function(e){var t="undefined"!=typeof e.getAttributeNode&&e.getAttributeNode("id");return t&&t.value===n}},b.find.ID=function(e,t){if("undefined"!=typeof t.getElementById&&E){var 
n,r,i,o=t.getElementById(e);if(o){if((n=o.getAttributeNode("id"))&&n.value===e)return[o];i=t.getElementsByName(e),r=0;while(o=i[r++])if((n=o.getAttributeNode("id"))&&n.value===e)return[o]}return[]}}),b.find.TAG=d.getElementsByTagName?function(e,t){return"undefined"!=typeof t.getElementsByTagName?t.getElementsByTagName(e):d.qsa?t.querySelectorAll(e):void 0}:function(e,t){var n,r=[],i=0,o=t.getElementsByTagName(e);if("*"===e){while(n=o[i++])1===n.nodeType&&r.push(n);return r}return o},b.find.CLASS=d.getElementsByClassName&&function(e,t){if("undefined"!=typeof t.getElementsByClassName&&E)return t.getElementsByClassName(e)},s=[],v=[],(d.qsa=K.test(C.querySelectorAll))&&(ce(function(e){var t;a.appendChild(e).innerHTML="<a id='"+S+"'></a><select id='"+S+"-\r\\' msallowcapture=''><option selected=''></option></select>",e.querySelectorAll("[msallowcapture^='']").length&&v.push("[*^$]="+M+"*(?:''|\"\")"),e.querySelectorAll("[selected]").length||v.push("\\["+M+"*(?:value|"+R+")"),e.querySelectorAll("[id~="+S+"-]").length||v.push("~="),(t=C.createElement("input")).setAttribute("name",""),e.appendChild(t),e.querySelectorAll("[name='']").length||v.push("\\["+M+"*name"+M+"*="+M+"*(?:''|\"\")"),e.querySelectorAll(":checked").length||v.push(":checked"),e.querySelectorAll("a#"+S+"+*").length||v.push(".#.+[+~]"),e.querySelectorAll("\\\f"),v.push("[\\r\\n\\f]")}),ce(function(e){e.innerHTML="<a href='' disabled='disabled'></a><select disabled='disabled'><option/></select>";var 
t=C.createElement("input");t.setAttribute("type","hidden"),e.appendChild(t).setAttribute("name","D"),e.querySelectorAll("[name=d]").length&&v.push("name"+M+"*[*^$|!~]?="),2!==e.querySelectorAll(":enabled").length&&v.push(":enabled",":disabled"),a.appendChild(e).disabled=!0,2!==e.querySelectorAll(":disabled").length&&v.push(":enabled",":disabled"),e.querySelectorAll("*,:x"),v.push(",.*:")})),(d.matchesSelector=K.test(c=a.matches||a.webkitMatchesSelector||a.mozMatchesSelector||a.oMatchesSelector||a.msMatchesSelector))&&ce(function(e){d.disconnectedMatch=c.call(e,"*"),c.call(e,"[s!='']:x"),s.push("!=",F)}),v=v.length&&new RegExp(v.join("|")),s=s.length&&new RegExp(s.join("|")),t=K.test(a.compareDocumentPosition),y=t||K.test(a.contains)?function(e,t){var n=9===e.nodeType?e.documentElement:e,r=t&&t.parentNode;return e===r||!(!r||1!==r.nodeType||!(n.contains?n.contains(r):e.compareDocumentPosition&&16&e.compareDocumentPosition(r)))}:function(e,t){if(t)while(t=t.parentNode)if(t===e)return!0;return!1},j=t?function(e,t){if(e===t)return l=!0,0;var n=!e.compareDocumentPosition-!t.compareDocumentPosition;return n||(1&(n=(e.ownerDocument||e)==(t.ownerDocument||t)?e.compareDocumentPosition(t):1)||!d.sortDetached&&t.compareDocumentPosition(e)===n?e==C||e.ownerDocument==p&&y(p,e)?-1:t==C||t.ownerDocument==p&&y(p,t)?1:u?P(u,e)-P(u,t):0:4&n?-1:1)}:function(e,t){if(e===t)return l=!0,0;var n,r=0,i=e.parentNode,o=t.parentNode,a=[e],s=[t];if(!i||!o)return e==C?-1:t==C?1:i?-1:o?1:u?P(u,e)-P(u,t):0;if(i===o)return pe(e,t);n=e;while(n=n.parentNode)a.unshift(n);n=t;while(n=n.parentNode)s.unshift(n);while(a[r]===s[r])r++;return r?pe(a[r],s[r]):a[r]==p?-1:s[r]==p?1:0}),C},se.matches=function(e,t){return se(e,null,null,t)},se.matchesSelector=function(e,t){if(T(e),d.matchesSelector&&E&&!N[t+" "]&&(!s||!s.test(t))&&(!v||!v.test(t)))try{var n=c.call(e,t);if(n||d.disconnectedMatch||e.document&&11!==e.document.nodeType)return n}catch(e){N(t,!0)}return 
0<se(t,C,null,[e]).length},se.contains=function(e,t){return(e.ownerDocument||e)!=C&&T(e),y(e,t)},se.attr=function(e,t){(e.ownerDocument||e)!=C&&T(e);var n=b.attrHandle[t.toLowerCase()],r=n&&D.call(b.attrHandle,t.toLowerCase())?n(e,t,!E):void 0;return void 0!==r?r:d.attributes||!E?e.getAttribute(t):(r=e.getAttributeNode(t))&&r.specified?r.value:null},se.escape=function(e){return(e+"").replace(re,ie)},se.error=function(e){throw new Error("Syntax error, unrecognized expression: "+e)},se.uniqueSort=function(e){var t,n=[],r=0,i=0;if(l=!d.detectDuplicates,u=!d.sortStable&&e.slice(0),e.sort(j),l){while(t=e[i++])t===e[i]&&(r=n.push(i));while(r--)e.splice(n[r],1)}return u=null,e},o=se.getText=function(e){var t,n="",r=0,i=e.nodeType;if(i){if(1===i||9===i||11===i){if("string"==typeof e.textContent)return e.textContent;for(e=e.firstChild;e;e=e.nextSibling)n+=o(e)}else if(3===i||4===i)return e.nodeValue}else while(t=e[r++])n+=o(t);return n},(b=se.selectors={cacheLength:50,createPseudo:le,match:G,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(e){return e[1]=e[1].replace(te,ne),e[3]=(e[3]||e[4]||e[5]||"").replace(te,ne),"~="===e[2]&&(e[3]=" "+e[3]+" "),e.slice(0,4)},CHILD:function(e){return e[1]=e[1].toLowerCase(),"nth"===e[1].slice(0,3)?(e[3]||se.error(e[0]),e[4]=+(e[4]?e[5]+(e[6]||1):2*("even"===e[3]||"odd"===e[3])),e[5]=+(e[7]+e[8]||"odd"===e[3])):e[3]&&se.error(e[0]),e},PSEUDO:function(e){var t,n=!e[6]&&e[2];return G.CHILD.test(e[0])?null:(e[3]?e[2]=e[4]||e[5]||"":n&&X.test(n)&&(t=h(n,!0))&&(t=n.indexOf(")",n.length-t)-n.length)&&(e[0]=e[0].slice(0,t),e[2]=n.slice(0,t)),e.slice(0,3))}},filter:{TAG:function(e){var t=e.replace(te,ne).toLowerCase();return"*"===e?function(){return!0}:function(e){return e.nodeName&&e.nodeName.toLowerCase()===t}},CLASS:function(e){var t=m[e+" "];return t||(t=new RegExp("(^|"+M+")"+e+"("+M+"|$)"))&&m(e,function(e){return 
t.test("string"==typeof e.className&&e.className||"undefined"!=typeof e.getAttribute&&e.getAttribute("class")||"")})},ATTR:function(n,r,i){return function(e){var t=se.attr(e,n);return null==t?"!="===r:!r||(t+="","="===r?t===i:"!="===r?t!==i:"^="===r?i&&0===t.indexOf(i):"*="===r?i&&-1<t.indexOf(i):"$="===r?i&&t.slice(-i.length)===i:"~="===r?-1<(" "+t.replace(B," ")+" ").indexOf(i):"|="===r&&(t===i||t.slice(0,i.length+1)===i+"-"))}},CHILD:function(h,e,t,g,v){var y="nth"!==h.slice(0,3),m="last"!==h.slice(-4),x="of-type"===e;return 1===g&&0===v?function(e){return!!e.parentNode}:function(e,t,n){var r,i,o,a,s,u,l=y!==m?"nextSibling":"previousSibling",c=e.parentNode,f=x&&e.nodeName.toLowerCase(),p=!n&&!x,d=!1;if(c){if(y){while(l){a=e;while(a=a[l])if(x?a.nodeName.toLowerCase()===f:1===a.nodeType)return!1;u=l="only"===h&&!u&&"nextSibling"}return!0}if(u=[m?c.firstChild:c.lastChild],m&&p){d=(s=(r=(i=(o=(a=c)[S]||(a[S]={}))[a.uniqueID]||(o[a.uniqueID]={}))[h]||[])[0]===k&&r[1])&&r[2],a=s&&c.childNodes[s];while(a=++s&&a&&a[l]||(d=s=0)||u.pop())if(1===a.nodeType&&++d&&a===e){i[h]=[k,s,d];break}}else if(p&&(d=s=(r=(i=(o=(a=e)[S]||(a[S]={}))[a.uniqueID]||(o[a.uniqueID]={}))[h]||[])[0]===k&&r[1]),!1===d)while(a=++s&&a&&a[l]||(d=s=0)||u.pop())if((x?a.nodeName.toLowerCase()===f:1===a.nodeType)&&++d&&(p&&((i=(o=a[S]||(a[S]={}))[a.uniqueID]||(o[a.uniqueID]={}))[h]=[k,d]),a===e))break;return(d-=v)===g||d%g==0&&0<=d/g}}},PSEUDO:function(e,o){var t,a=b.pseudos[e]||b.setFilters[e.toLowerCase()]||se.error("unsupported pseudo: "+e);return a[S]?a(o):1<a.length?(t=[e,e,"",o],b.setFilters.hasOwnProperty(e.toLowerCase())?le(function(e,t){var n,r=a(e,o),i=r.length;while(i--)e[n=P(e,r[i])]=!(t[n]=r[i])}):function(e){return a(e,0,t)}):a}},pseudos:{not:le(function(e){var r=[],i=[],s=f(e.replace($,"$1"));return s[S]?le(function(e,t,n,r){var i,o=s(e,null,r,[]),a=e.length;while(a--)(i=o[a])&&(e[a]=!(t[a]=i))}):function(e,t,n){return r[0]=e,s(r,null,n,i),r[0]=null,!i.pop()}}),has:le(function(t){return 
function(e){return 0<se(t,e).length}}),contains:le(function(t){return t=t.replace(te,ne),function(e){return-1<(e.textContent||o(e)).indexOf(t)}}),lang:le(function(n){return V.test(n||"")||se.error("unsupported lang: "+n),n=n.replace(te,ne).toLowerCase(),function(e){var t;do{if(t=E?e.lang:e.getAttribute("xml:lang")||e.getAttribute("lang"))return(t=t.toLowerCase())===n||0===t.indexOf(n+"-")}while((e=e.parentNode)&&1===e.nodeType);return!1}}),target:function(e){var t=n.location&&n.location.hash;return t&&t.slice(1)===e.id},root:function(e){return e===a},focus:function(e){return e===C.activeElement&&(!C.hasFocus||C.hasFocus())&&!!(e.type||e.href||~e.tabIndex)},enabled:ge(!1),disabled:ge(!0),checked:function(e){var t=e.nodeName.toLowerCase();return"input"===t&&!!e.checked||"option"===t&&!!e.selected},selected:function(e){return e.parentNode&&e.parentNode.selectedIndex,!0===e.selected},empty:function(e){for(e=e.firstChild;e;e=e.nextSibling)if(e.nodeType<6)return!1;return!0},parent:function(e){return!b.pseudos.empty(e)},header:function(e){return J.test(e.nodeName)},input:function(e){return Q.test(e.nodeName)},button:function(e){var t=e.nodeName.toLowerCase();return"input"===t&&"button"===e.type||"button"===t},text:function(e){var t;return"input"===e.nodeName.toLowerCase()&&"text"===e.type&&(null==(t=e.getAttribute("type"))||"text"===t.toLowerCase())},first:ve(function(){return[0]}),last:ve(function(e,t){return[t-1]}),eq:ve(function(e,t,n){return[n<0?n+t:n]}),even:ve(function(e,t){for(var n=0;n<t;n+=2)e.push(n);return e}),odd:ve(function(e,t){for(var n=1;n<t;n+=2)e.push(n);return e}),lt:ve(function(e,t,n){for(var r=n<0?n+t:t<n?t:n;0<=--r;)e.push(r);return e}),gt:ve(function(e,t,n){for(var r=n<0?n+t:n;++r<t;)e.push(r);return e})}}).pseudos.nth=b.pseudos.eq,{radio:!0,checkbox:!0,file:!0,password:!0,image:!0})b.pseudos[e]=de(e);for(e in{submit:!0,reset:!0})b.pseudos[e]=he(e);function me(){}function xe(e){for(var t=0,n=e.length,r="";t<n;t++)r+=e[t].value;return r}function 
be(s,e,t){var u=e.dir,l=e.next,c=l||u,f=t&&"parentNode"===c,p=r++;return e.first?function(e,t,n){while(e=e[u])if(1===e.nodeType||f)return s(e,t,n);return!1}:function(e,t,n){var r,i,o,a=[k,p];if(n){while(e=e[u])if((1===e.nodeType||f)&&s(e,t,n))return!0}else while(e=e[u])if(1===e.nodeType||f)if(i=(o=e[S]||(e[S]={}))[e.uniqueID]||(o[e.uniqueID]={}),l&&l===e.nodeName.toLowerCase())e=e[u]||e;else{if((r=i[c])&&r[0]===k&&r[1]===p)return a[2]=r[2];if((i[c]=a)[2]=s(e,t,n))return!0}return!1}}function we(i){return 1<i.length?function(e,t,n){var r=i.length;while(r--)if(!i[r](e,t,n))return!1;return!0}:i[0]}function Te(e,t,n,r,i){for(var o,a=[],s=0,u=e.length,l=null!=t;s<u;s++)(o=e[s])&&(n&&!n(o,r,i)||(a.push(o),l&&t.push(s)));return a}function Ce(d,h,g,v,y,e){return v&&!v[S]&&(v=Ce(v)),y&&!y[S]&&(y=Ce(y,e)),le(function(e,t,n,r){var i,o,a,s=[],u=[],l=t.length,c=e||function(e,t,n){for(var r=0,i=t.length;r<i;r++)se(e,t[r],n);return n}(h||"*",n.nodeType?[n]:n,[]),f=!d||!e&&h?c:Te(c,s,d,n,r),p=g?y||(e?d:l||v)?[]:t:f;if(g&&g(f,p,n,r),v){i=Te(p,u),v(i,[],n,r),o=i.length;while(o--)(a=i[o])&&(p[u[o]]=!(f[u[o]]=a))}if(e){if(y||d){if(y){i=[],o=p.length;while(o--)(a=p[o])&&i.push(f[o]=a);y(null,p=[],i,r)}o=p.length;while(o--)(a=p[o])&&-1<(i=y?P(e,a):s[o])&&(e[i]=!(t[i]=a))}}else p=Te(p===t?p.splice(l,p.length):p),y?y(null,t,p,r):H.apply(t,p)})}function Ee(e){for(var i,t,n,r=e.length,o=b.relative[e[0].type],a=o||b.relative[" "],s=o?1:0,u=be(function(e){return e===i},a,!0),l=be(function(e){return-1<P(i,e)},a,!0),c=[function(e,t,n){var r=!o&&(n||t!==w)||((i=t).nodeType?u(e,t,n):l(e,t,n));return i=null,r}];s<r;s++)if(t=b.relative[e[s].type])c=[be(we(c),t)];else{if((t=b.filter[e[s].type].apply(null,e[s].matches))[S]){for(n=++s;n<r;n++)if(b.relative[e[n].type])break;return Ce(1<s&&we(c),1<s&&xe(e.slice(0,s-1).concat({value:" "===e[s-2].type?"*":""})).replace($,"$1"),t,s<n&&Ee(e.slice(s,n)),n<r&&Ee(e=e.slice(n)),n<r&&xe(e))}c.push(t)}return we(c)}return 
me.prototype=b.filters=b.pseudos,b.setFilters=new me,h=se.tokenize=function(e,t){var n,r,i,o,a,s,u,l=x[e+" "];if(l)return t?0:l.slice(0);a=e,s=[],u=b.preFilter;while(a){for(o in n&&!(r=_.exec(a))||(r&&(a=a.slice(r[0].length)||a),s.push(i=[])),n=!1,(r=z.exec(a))&&(n=r.shift(),i.push({value:n,type:r[0].replace($," ")}),a=a.slice(n.length)),b.filter)!(r=G[o].exec(a))||u[o]&&!(r=u[o](r))||(n=r.shift(),i.push({value:n,type:o,matches:r}),a=a.slice(n.length));if(!n)break}return t?a.length:a?se.error(e):x(e,s).slice(0)},f=se.compile=function(e,t){var n,v,y,m,x,r,i=[],o=[],a=A[e+" "];if(!a){t||(t=h(e)),n=t.length;while(n--)(a=Ee(t[n]))[S]?i.push(a):o.push(a);(a=A(e,(v=o,m=0<(y=i).length,x=0<v.length,r=function(e,t,n,r,i){var o,a,s,u=0,l="0",c=e&&[],f=[],p=w,d=e||x&&b.find.TAG("*",i),h=k+=null==p?1:Math.random()||.1,g=d.length;for(i&&(w=t==C||t||i);l!==g&&null!=(o=d[l]);l++){if(x&&o){a=0,t||o.ownerDocument==C||(T(o),n=!E);while(s=v[a++])if(s(o,t||C,n)){r.push(o);break}i&&(k=h)}m&&((o=!s&&o)&&u--,e&&c.push(o))}if(u+=l,m&&l!==u){a=0;while(s=y[a++])s(c,f,t,n);if(e){if(0<u)while(l--)c[l]||f[l]||(f[l]=q.call(r));f=Te(f)}H.apply(r,f),i&&!e&&0<f.length&&1<u+y.length&&se.uniqueSort(r)}return i&&(k=h,w=p),c},m?le(r):r))).selector=e}return a},g=se.select=function(e,t,n,r){var i,o,a,s,u,l="function"==typeof e&&e,c=!r&&h(e=l.selector||e);if(n=n||[],1===c.length){if(2<(o=c[0]=c[0].slice(0)).length&&"ID"===(a=o[0]).type&&9===t.nodeType&&E&&b.relative[o[1].type]){if(!(t=(b.find.ID(a.matches[0].replace(te,ne),t)||[])[0]))return n;l&&(t=t.parentNode),e=e.slice(o.shift().value.length)}i=G.needsContext.test(e)?0:o.length;while(i--){if(a=o[i],b.relative[s=a.type])break;if((u=b.find[s])&&(r=u(a.matches[0].replace(te,ne),ee.test(o[0].type)&&ye(t.parentNode)||t))){if(o.splice(i,1),!(e=r.length&&xe(o)))return 
H.apply(n,r),n;break}}}return(l||f(e,c))(r,t,!E,n,!t||ee.test(e)&&ye(t.parentNode)||t),n},d.sortStable=S.split("").sort(j).join("")===S,d.detectDuplicates=!!l,T(),d.sortDetached=ce(function(e){return 1&e.compareDocumentPosition(C.createElement("fieldset"))}),ce(function(e){return e.innerHTML="<a href='#'></a>","#"===e.firstChild.getAttribute("href")})||fe("type|href|height|width",function(e,t,n){if(!n)return e.getAttribute(t,"type"===t.toLowerCase()?1:2)}),d.attributes&&ce(function(e){return e.innerHTML="<input/>",e.firstChild.setAttribute("value",""),""===e.firstChild.getAttribute("value")})||fe("value",function(e,t,n){if(!n&&"input"===e.nodeName.toLowerCase())return e.defaultValue}),ce(function(e){return null==e.getAttribute("disabled")})||fe(R,function(e,t,n){var r;if(!n)return!0===e[t]?t.toLowerCase():(r=e.getAttributeNode(t))&&r.specified?r.value:null}),se}(C);S.find=d,S.expr=d.selectors,S.expr[":"]=S.expr.pseudos,S.uniqueSort=S.unique=d.uniqueSort,S.text=d.getText,S.isXMLDoc=d.isXML,S.contains=d.contains,S.escapeSelector=d.escape;var h=function(e,t,n){var r=[],i=void 0!==n;while((e=e[t])&&9!==e.nodeType)if(1===e.nodeType){if(i&&S(e).is(n))break;r.push(e)}return r},T=function(e,t){for(var n=[];e;e=e.nextSibling)1===e.nodeType&&e!==t&&n.push(e);return n},k=S.expr.match.needsContext;function A(e,t){return e.nodeName&&e.nodeName.toLowerCase()===t.toLowerCase()}var N=/^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function j(e,n,r){return m(n)?S.grep(e,function(e,t){return!!n.call(e,t,e)!==r}):n.nodeType?S.grep(e,function(e){return e===n!==r}):"string"!=typeof n?S.grep(e,function(e){return-1<i.call(n,e)!==r}):S.filter(n,e,r)}S.filter=function(e,t,n){var r=t[0];return n&&(e=":not("+e+")"),1===t.length&&1===r.nodeType?S.find.matchesSelector(r,e)?[r]:[]:S.find.matches(e,S.grep(t,function(e){return 1===e.nodeType}))},S.fn.extend({find:function(e){var t,n,r=this.length,i=this;if("string"!=typeof e)return 
this.pushStack(S(e).filter(function(){for(t=0;t<r;t++)if(S.contains(i[t],this))return!0}));for(n=this.pushStack([]),t=0;t<r;t++)S.find(e,i[t],n);return 1<r?S.uniqueSort(n):n},filter:function(e){return this.pushStack(j(this,e||[],!1))},not:function(e){return this.pushStack(j(this,e||[],!0))},is:function(e){return!!j(this,"string"==typeof e&&k.test(e)?S(e):e||[],!1).length}});var D,q=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/;(S.fn.init=function(e,t,n){var r,i;if(!e)return this;if(n=n||D,"string"==typeof e){if(!(r="<"===e[0]&&">"===e[e.length-1]&&3<=e.length?[null,e,null]:q.exec(e))||!r[1]&&t)return!t||t.jquery?(t||n).find(e):this.constructor(t).find(e);if(r[1]){if(t=t instanceof S?t[0]:t,S.merge(this,S.parseHTML(r[1],t&&t.nodeType?t.ownerDocument||t:E,!0)),N.test(r[1])&&S.isPlainObject(t))for(r in t)m(this[r])?this[r](t[r]):this.attr(r,t[r]);return this}return(i=E.getElementById(r[2]))&&(this[0]=i,this.length=1),this}return e.nodeType?(this[0]=e,this.length=1,this):m(e)?void 0!==n.ready?n.ready(e):e(S):S.makeArray(e,this)}).prototype=S.fn,D=S(E);var L=/^(?:parents|prev(?:Until|All))/,H={children:!0,contents:!0,next:!0,prev:!0};function O(e,t){while((e=e[t])&&1!==e.nodeType);return e}S.fn.extend({has:function(e){var t=S(e,this),n=t.length;return this.filter(function(){for(var e=0;e<n;e++)if(S.contains(this,t[e]))return!0})},closest:function(e,t){var n,r=0,i=this.length,o=[],a="string"!=typeof e&&S(e);if(!k.test(e))for(;r<i;r++)for(n=this[r];n&&n!==t;n=n.parentNode)if(n.nodeType<11&&(a?-1<a.index(n):1===n.nodeType&&S.find.matchesSelector(n,e))){o.push(n);break}return this.pushStack(1<o.length?S.uniqueSort(o):o)},index:function(e){return e?"string"==typeof e?i.call(S(e),this[0]):i.call(this,e.jquery?e[0]:e):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(e,t){return this.pushStack(S.uniqueSort(S.merge(this.get(),S(e,t))))},addBack:function(e){return this.add(null==e?this.prevObject:this.prevObject.filter(e))}}),S.each({parent:function(e){var 
t=e.parentNode;return t&&11!==t.nodeType?t:null},parents:function(e){return h(e,"parentNode")},parentsUntil:function(e,t,n){return h(e,"parentNode",n)},next:function(e){return O(e,"nextSibling")},prev:function(e){return O(e,"previousSibling")},nextAll:function(e){return h(e,"nextSibling")},prevAll:function(e){return h(e,"previousSibling")},nextUntil:function(e,t,n){return h(e,"nextSibling",n)},prevUntil:function(e,t,n){return h(e,"previousSibling",n)},siblings:function(e){return T((e.parentNode||{}).firstChild,e)},children:function(e){return T(e.firstChild)},contents:function(e){return null!=e.contentDocument&&r(e.contentDocument)?e.contentDocument:(A(e,"template")&&(e=e.content||e),S.merge([],e.childNodes))}},function(r,i){S.fn[r]=function(e,t){var n=S.map(this,i,e);return"Until"!==r.slice(-5)&&(t=e),t&&"string"==typeof t&&(n=S.filter(t,n)),1<this.length&&(H[r]||S.uniqueSort(n),L.test(r)&&n.reverse()),this.pushStack(n)}});var P=/[^\x20\t\r\n\f]+/g;function R(e){return e}function M(e){throw e}function I(e,t,n,r){var i;try{e&&m(i=e.promise)?i.call(e).done(t).fail(n):e&&m(i=e.then)?i.call(e,t,n):t.apply(void 0,[e].slice(r))}catch(e){n.apply(void 0,[e])}}S.Callbacks=function(r){var e,n;r="string"==typeof r?(e=r,n={},S.each(e.match(P)||[],function(e,t){n[t]=!0}),n):S.extend({},r);var i,t,o,a,s=[],u=[],l=-1,c=function(){for(a=a||r.once,o=i=!0;u.length;l=-1){t=u.shift();while(++l<s.length)!1===s[l].apply(t[0],t[1])&&r.stopOnFalse&&(l=s.length,t=!1)}r.memory||(t=!1),i=!1,a&&(s=t?[]:"")},f={add:function(){return s&&(t&&!i&&(l=s.length-1,u.push(t)),function n(e){S.each(e,function(e,t){m(t)?r.unique&&f.has(t)||s.push(t):t&&t.length&&"string"!==w(t)&&n(t)})}(arguments),t&&!i&&c()),this},remove:function(){return S.each(arguments,function(e,t){var n;while(-1<(n=S.inArray(t,s,n)))s.splice(n,1),n<=l&&l--}),this},has:function(e){return e?-1<S.inArray(e,s):0<s.length},empty:function(){return s&&(s=[]),this},disable:function(){return 
a=u=[],s=t="",this},disabled:function(){return!s},lock:function(){return a=u=[],t||i||(s=t=""),this},locked:function(){return!!a},fireWith:function(e,t){return a||(t=[e,(t=t||[]).slice?t.slice():t],u.push(t),i||c()),this},fire:function(){return f.fireWith(this,arguments),this},fired:function(){return!!o}};return f},S.extend({Deferred:function(e){var o=[["notify","progress",S.Callbacks("memory"),S.Callbacks("memory"),2],["resolve","done",S.Callbacks("once memory"),S.Callbacks("once memory"),0,"resolved"],["reject","fail",S.Callbacks("once memory"),S.Callbacks("once memory"),1,"rejected"]],i="pending",a={state:function(){return i},always:function(){return s.done(arguments).fail(arguments),this},"catch":function(e){return a.then(null,e)},pipe:function(){var i=arguments;return S.Deferred(function(r){S.each(o,function(e,t){var n=m(i[t[4]])&&i[t[4]];s[t[1]](function(){var e=n&&n.apply(this,arguments);e&&m(e.promise)?e.promise().progress(r.notify).done(r.resolve).fail(r.reject):r[t[0]+"With"](this,n?[e]:arguments)})}),i=null}).promise()},then:function(t,n,r){var u=0;function l(i,o,a,s){return function(){var n=this,r=arguments,e=function(){var e,t;if(!(i<u)){if((e=a.apply(n,r))===o.promise())throw new TypeError("Thenable self-resolution");t=e&&("object"==typeof e||"function"==typeof e)&&e.then,m(t)?s?t.call(e,l(u,o,R,s),l(u,o,M,s)):(u++,t.call(e,l(u,o,R,s),l(u,o,M,s),l(u,o,R,o.notifyWith))):(a!==R&&(n=void 0,r=[e]),(s||o.resolveWith)(n,r))}},t=s?e:function(){try{e()}catch(e){S.Deferred.exceptionHook&&S.Deferred.exceptionHook(e,t.stackTrace),u<=i+1&&(a!==M&&(n=void 0,r=[e]),o.rejectWith(n,r))}};i?t():(S.Deferred.getStackHook&&(t.stackTrace=S.Deferred.getStackHook()),C.setTimeout(t))}}return S.Deferred(function(e){o[0][3].add(l(0,e,m(r)?r:R,e.notifyWith)),o[1][3].add(l(0,e,m(t)?t:R)),o[2][3].add(l(0,e,m(n)?n:M))}).promise()},promise:function(e){return null!=e?S.extend(e,a):a}},s={};return S.each(o,function(e,t){var 
n=t[2],r=t[5];a[t[1]]=n.add,r&&n.add(function(){i=r},o[3-e][2].disable,o[3-e][3].disable,o[0][2].lock,o[0][3].lock),n.add(t[3].fire),s[t[0]]=function(){return s[t[0]+"With"](this===s?void 0:this,arguments),this},s[t[0]+"With"]=n.fireWith}),a.promise(s),e&&e.call(s,s),s},when:function(e){var n=arguments.length,t=n,r=Array(t),i=s.call(arguments),o=S.Deferred(),a=function(t){return function(e){r[t]=this,i[t]=1<arguments.length?s.call(arguments):e,--n||o.resolveWith(r,i)}};if(n<=1&&(I(e,o.done(a(t)).resolve,o.reject,!n),"pending"===o.state()||m(i[t]&&i[t].then)))return o.then();while(t--)I(i[t],a(t),o.reject);return o.promise()}});var W=/^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/;S.Deferred.exceptionHook=function(e,t){C.console&&C.console.warn&&e&&W.test(e.name)&&C.console.warn("jQuery.Deferred exception: "+e.message,e.stack,t)},S.readyException=function(e){C.setTimeout(function(){throw e})};var F=S.Deferred();function B(){E.removeEventListener("DOMContentLoaded",B),C.removeEventListener("load",B),S.ready()}S.fn.ready=function(e){return F.then(e)["catch"](function(e){S.readyException(e)}),this},S.extend({isReady:!1,readyWait:1,ready:function(e){(!0===e?--S.readyWait:S.isReady)||(S.isReady=!0)!==e&&0<--S.readyWait||F.resolveWith(E,[S])}}),S.ready.then=F.then,"complete"===E.readyState||"loading"!==E.readyState&&!E.documentElement.doScroll?C.setTimeout(S.ready):(E.addEventListener("DOMContentLoaded",B),C.addEventListener("load",B));var $=function(e,t,n,r,i,o,a){var s=0,u=e.length,l=null==n;if("object"===w(n))for(s in i=!0,n)$(e,t,s,n[s],!0,o,a);else if(void 0!==r&&(i=!0,m(r)||(a=!0),l&&(a?(t.call(e,r),t=null):(l=t,t=function(e,t,n){return l.call(S(e),n)})),t))for(;s<u;s++)t(e[s],n,a?r:r.call(e[s],s,t(e[s],n)));return i?e:l?t.call(e):u?t(e[0],n):o},_=/^-ms-/,z=/-([a-z])/g;function U(e,t){return t.toUpperCase()}function X(e){return e.replace(_,"ms-").replace(z,U)}var V=function(e){return 1===e.nodeType||9===e.nodeType||!+e.nodeType};function 
G(){this.expando=S.expando+G.uid++}G.uid=1,G.prototype={cache:function(e){var t=e[this.expando];return t||(t={},V(e)&&(e.nodeType?e[this.expando]=t:Object.defineProperty(e,this.expando,{value:t,configurable:!0}))),t},set:function(e,t,n){var r,i=this.cache(e);if("string"==typeof t)i[X(t)]=n;else for(r in t)i[X(r)]=t[r];return i},get:function(e,t){return void 0===t?this.cache(e):e[this.expando]&&e[this.expando][X(t)]},access:function(e,t,n){return void 0===t||t&&"string"==typeof t&&void 0===n?this.get(e,t):(this.set(e,t,n),void 0!==n?n:t)},remove:function(e,t){var n,r=e[this.expando];if(void 0!==r){if(void 0!==t){n=(t=Array.isArray(t)?t.map(X):(t=X(t))in r?[t]:t.match(P)||[]).length;while(n--)delete r[t[n]]}(void 0===t||S.isEmptyObject(r))&&(e.nodeType?e[this.expando]=void 0:delete e[this.expando])}},hasData:function(e){var t=e[this.expando];return void 0!==t&&!S.isEmptyObject(t)}};var Y=new G,Q=new G,J=/^(?:\{[\w\W]*\}|\[[\w\W]*\])$/,K=/[A-Z]/g;function Z(e,t,n){var r,i;if(void 0===n&&1===e.nodeType)if(r="data-"+t.replace(K,"-$&").toLowerCase(),"string"==typeof(n=e.getAttribute(r))){try{n="true"===(i=n)||"false"!==i&&("null"===i?null:i===+i+""?+i:J.test(i)?JSON.parse(i):i)}catch(e){}Q.set(e,t,n)}else n=void 0;return n}S.extend({hasData:function(e){return Q.hasData(e)||Y.hasData(e)},data:function(e,t,n){return Q.access(e,t,n)},removeData:function(e,t){Q.remove(e,t)},_data:function(e,t,n){return Y.access(e,t,n)},_removeData:function(e,t){Y.remove(e,t)}}),S.fn.extend({data:function(n,e){var t,r,i,o=this[0],a=o&&o.attributes;if(void 0===n){if(this.length&&(i=Q.get(o),1===o.nodeType&&!Y.get(o,"hasDataAttrs"))){t=a.length;while(t--)a[t]&&0===(r=a[t].name).indexOf("data-")&&(r=X(r.slice(5)),Z(o,r,i[r]));Y.set(o,"hasDataAttrs",!0)}return i}return"object"==typeof n?this.each(function(){Q.set(this,n)}):$(this,function(e){var t;if(o&&void 0===e)return void 0!==(t=Q.get(o,n))?t:void 0!==(t=Z(o,n))?t:void 
0;this.each(function(){Q.set(this,n,e)})},null,e,1<arguments.length,null,!0)},removeData:function(e){return this.each(function(){Q.remove(this,e)})}}),S.extend({queue:function(e,t,n){var r;if(e)return t=(t||"fx")+"queue",r=Y.get(e,t),n&&(!r||Array.isArray(n)?r=Y.access(e,t,S.makeArray(n)):r.push(n)),r||[]},dequeue:function(e,t){t=t||"fx";var n=S.queue(e,t),r=n.length,i=n.shift(),o=S._queueHooks(e,t);"inprogress"===i&&(i=n.shift(),r--),i&&("fx"===t&&n.unshift("inprogress"),delete o.stop,i.call(e,function(){S.dequeue(e,t)},o)),!r&&o&&o.empty.fire()},_queueHooks:function(e,t){var n=t+"queueHooks";return Y.get(e,n)||Y.access(e,n,{empty:S.Callbacks("once memory").add(function(){Y.remove(e,[t+"queue",n])})})}}),S.fn.extend({queue:function(t,n){var e=2;return"string"!=typeof t&&(n=t,t="fx",e--),arguments.length<e?S.queue(this[0],t):void 0===n?this:this.each(function(){var e=S.queue(this,t,n);S._queueHooks(this,t),"fx"===t&&"inprogress"!==e[0]&&S.dequeue(this,t)})},dequeue:function(e){return this.each(function(){S.dequeue(this,e)})},clearQueue:function(e){return this.queue(e||"fx",[])},promise:function(e,t){var n,r=1,i=S.Deferred(),o=this,a=this.length,s=function(){--r||i.resolveWith(o,[o])};"string"!=typeof e&&(t=e,e=void 0),e=e||"fx";while(a--)(n=Y.get(o[a],e+"queueHooks"))&&n.empty&&(r++,n.empty.add(s));return s(),i.promise(t)}});var ee=/[+-]?(?:\d*\.|)\d+(?:[eE][+-]?\d+|)/.source,te=new RegExp("^(?:([+-])=|)("+ee+")([a-z%]*)$","i"),ne=["Top","Right","Bottom","Left"],re=E.documentElement,ie=function(e){return S.contains(e.ownerDocument,e)},oe={composed:!0};re.getRootNode&&(ie=function(e){return S.contains(e.ownerDocument,e)||e.getRootNode(oe)===e.ownerDocument});var ae=function(e,t){return"none"===(e=t||e).style.display||""===e.style.display&&ie(e)&&"none"===S.css(e,"display")};function se(e,t,n,r){var i,o,a=20,s=r?function(){return r.cur()}:function(){return 
S.css(e,t,"")},u=s(),l=n&&n[3]||(S.cssNumber[t]?"":"px"),c=e.nodeType&&(S.cssNumber[t]||"px"!==l&&+u)&&te.exec(S.css(e,t));if(c&&c[3]!==l){u/=2,l=l||c[3],c=+u||1;while(a--)S.style(e,t,c+l),(1-o)*(1-(o=s()/u||.5))<=0&&(a=0),c/=o;c*=2,S.style(e,t,c+l),n=n||[]}return n&&(c=+c||+u||0,i=n[1]?c+(n[1]+1)*n[2]:+n[2],r&&(r.unit=l,r.start=c,r.end=i)),i}var ue={};function le(e,t){for(var n,r,i,o,a,s,u,l=[],c=0,f=e.length;c<f;c++)(r=e[c]).style&&(n=r.style.display,t?("none"===n&&(l[c]=Y.get(r,"display")||null,l[c]||(r.style.display="")),""===r.style.display&&ae(r)&&(l[c]=(u=a=o=void 0,a=(i=r).ownerDocument,s=i.nodeName,(u=ue[s])||(o=a.body.appendChild(a.createElement(s)),u=S.css(o,"display"),o.parentNode.removeChild(o),"none"===u&&(u="block"),ue[s]=u)))):"none"!==n&&(l[c]="none",Y.set(r,"display",n)));for(c=0;c<f;c++)null!=l[c]&&(e[c].style.display=l[c]);return e}S.fn.extend({show:function(){return le(this,!0)},hide:function(){return le(this)},toggle:function(e){return"boolean"==typeof e?e?this.show():this.hide():this.each(function(){ae(this)?S(this).show():S(this).hide()})}});var ce,fe,pe=/^(?:checkbox|radio)$/i,de=/<([a-z][^\/\0>\x20\t\r\n\f]*)/i,he=/^$|^module$|\/(?:java|ecma)script/i;ce=E.createDocumentFragment().appendChild(E.createElement("div")),(fe=E.createElement("input")).setAttribute("type","radio"),fe.setAttribute("checked","checked"),fe.setAttribute("name","t"),ce.appendChild(fe),y.checkClone=ce.cloneNode(!0).cloneNode(!0).lastChild.checked,ce.innerHTML="<textarea>x</textarea>",y.noCloneChecked=!!ce.cloneNode(!0).lastChild.defaultValue,ce.innerHTML="<option></option>",y.option=!!ce.lastChild;var ge={thead:[1,"<table>","</table>"],col:[2,"<table><colgroup>","</colgroup></table>"],tr:[2,"<table><tbody>","</tbody></table>"],td:[3,"<table><tbody><tr>","</tr></tbody></table>"],_default:[0,"",""]};function ve(e,t){var n;return n="undefined"!=typeof e.getElementsByTagName?e.getElementsByTagName(t||"*"):"undefined"!=typeof 
e.querySelectorAll?e.querySelectorAll(t||"*"):[],void 0===t||t&&A(e,t)?S.merge([e],n):n}function ye(e,t){for(var n=0,r=e.length;n<r;n++)Y.set(e[n],"globalEval",!t||Y.get(t[n],"globalEval"))}ge.tbody=ge.tfoot=ge.colgroup=ge.caption=ge.thead,ge.th=ge.td,y.option||(ge.optgroup=ge.option=[1,"<select multiple='multiple'>","</select>"]);var me=/<|&#?\w+;/;function xe(e,t,n,r,i){for(var o,a,s,u,l,c,f=t.createDocumentFragment(),p=[],d=0,h=e.length;d<h;d++)if((o=e[d])||0===o)if("object"===w(o))S.merge(p,o.nodeType?[o]:o);else if(me.test(o)){a=a||f.appendChild(t.createElement("div")),s=(de.exec(o)||["",""])[1].toLowerCase(),u=ge[s]||ge._default,a.innerHTML=u[1]+S.htmlPrefilter(o)+u[2],c=u[0];while(c--)a=a.lastChild;S.merge(p,a.childNodes),(a=f.firstChild).textContent=""}else p.push(t.createTextNode(o));f.textContent="",d=0;while(o=p[d++])if(r&&-1<S.inArray(o,r))i&&i.push(o);else if(l=ie(o),a=ve(f.appendChild(o),"script"),l&&ye(a),n){c=0;while(o=a[c++])he.test(o.type||"")&&n.push(o)}return f}var be=/^([^.]*)(?:\.(.+)|)/;function we(){return!0}function Te(){return!1}function Ce(e,t){return e===function(){try{return E.activeElement}catch(e){}}()==("focus"===t)}function Ee(e,t,n,r,i,o){var a,s;if("object"==typeof t){for(s in"string"!=typeof n&&(r=r||n,n=void 0),t)Ee(e,s,n,r,t[s],o);return e}if(null==r&&null==i?(i=n,r=n=void 0):null==i&&("string"==typeof n?(i=r,r=void 0):(i=r,r=n,n=void 0)),!1===i)i=Te;else if(!i)return e;return 1===o&&(a=i,(i=function(e){return S().off(e),a.apply(this,arguments)}).guid=a.guid||(a.guid=S.guid++)),e.each(function(){S.event.add(this,t,i,r,n)})}function Se(e,i,o){o?(Y.set(e,i,!1),S.event.add(e,i,{namespace:!1,handler:function(e){var t,n,r=Y.get(this,i);if(1&e.isTrigger&&this[i]){if(r.length)(S.event.special[i]||{}).delegateType&&e.stopPropagation();else if(r=s.call(arguments),Y.set(this,i,r),t=o(this,i),this[i](),r!==(n=Y.get(this,i))||t?Y.set(this,i,!1):n={},r!==n)return e.stopImmediatePropagation(),e.preventDefault(),n&&n.value}else 
r.length&&(Y.set(this,i,{value:S.event.trigger(S.extend(r[0],S.Event.prototype),r.slice(1),this)}),e.stopImmediatePropagation())}})):void 0===Y.get(e,i)&&S.event.add(e,i,we)}S.event={global:{},add:function(t,e,n,r,i){var o,a,s,u,l,c,f,p,d,h,g,v=Y.get(t);if(V(t)){n.handler&&(n=(o=n).handler,i=o.selector),i&&S.find.matchesSelector(re,i),n.guid||(n.guid=S.guid++),(u=v.events)||(u=v.events=Object.create(null)),(a=v.handle)||(a=v.handle=function(e){return"undefined"!=typeof S&&S.event.triggered!==e.type?S.event.dispatch.apply(t,arguments):void 0}),l=(e=(e||"").match(P)||[""]).length;while(l--)d=g=(s=be.exec(e[l])||[])[1],h=(s[2]||"").split(".").sort(),d&&(f=S.event.special[d]||{},d=(i?f.delegateType:f.bindType)||d,f=S.event.special[d]||{},c=S.extend({type:d,origType:g,data:r,handler:n,guid:n.guid,selector:i,needsContext:i&&S.expr.match.needsContext.test(i),namespace:h.join(".")},o),(p=u[d])||((p=u[d]=[]).delegateCount=0,f.setup&&!1!==f.setup.call(t,r,h,a)||t.addEventListener&&t.addEventListener(d,a)),f.add&&(f.add.call(t,c),c.handler.guid||(c.handler.guid=n.guid)),i?p.splice(p.delegateCount++,0,c):p.push(c),S.event.global[d]=!0)}},remove:function(e,t,n,r,i){var o,a,s,u,l,c,f,p,d,h,g,v=Y.hasData(e)&&Y.get(e);if(v&&(u=v.events)){l=(t=(t||"").match(P)||[""]).length;while(l--)if(d=g=(s=be.exec(t[l])||[])[1],h=(s[2]||"").split(".").sort(),d){f=S.event.special[d]||{},p=u[d=(r?f.delegateType:f.bindType)||d]||[],s=s[2]&&new RegExp("(^|\\.)"+h.join("\\.(?:.*\\.|)")+"(\\.|$)"),a=o=p.length;while(o--)c=p[o],!i&&g!==c.origType||n&&n.guid!==c.guid||s&&!s.test(c.namespace)||r&&r!==c.selector&&("**"!==r||!c.selector)||(p.splice(o,1),c.selector&&p.delegateCount--,f.remove&&f.remove.call(e,c));a&&!p.length&&(f.teardown&&!1!==f.teardown.call(e,h,v.handle)||S.removeEvent(e,d,v.handle),delete u[d])}else for(d in u)S.event.remove(e,d+t[l],n,r,!0);S.isEmptyObject(u)&&Y.remove(e,"handle events")}},dispatch:function(e){var t,n,r,i,o,a,s=new 
Array(arguments.length),u=S.event.fix(e),l=(Y.get(this,"events")||Object.create(null))[u.type]||[],c=S.event.special[u.type]||{};for(s[0]=u,t=1;t<arguments.length;t++)s[t]=arguments[t];if(u.delegateTarget=this,!c.preDispatch||!1!==c.preDispatch.call(this,u)){a=S.event.handlers.call(this,u,l),t=0;while((i=a[t++])&&!u.isPropagationStopped()){u.currentTarget=i.elem,n=0;while((o=i.handlers[n++])&&!u.isImmediatePropagationStopped())u.rnamespace&&!1!==o.namespace&&!u.rnamespace.test(o.namespace)||(u.handleObj=o,u.data=o.data,void 0!==(r=((S.event.special[o.origType]||{}).handle||o.handler).apply(i.elem,s))&&!1===(u.result=r)&&(u.preventDefault(),u.stopPropagation()))}return c.postDispatch&&c.postDispatch.call(this,u),u.result}},handlers:function(e,t){var n,r,i,o,a,s=[],u=t.delegateCount,l=e.target;if(u&&l.nodeType&&!("click"===e.type&&1<=e.button))for(;l!==this;l=l.parentNode||this)if(1===l.nodeType&&("click"!==e.type||!0!==l.disabled)){for(o=[],a={},n=0;n<u;n++)void 0===a[i=(r=t[n]).selector+" "]&&(a[i]=r.needsContext?-1<S(i,this).index(l):S.find(i,this,null,[l]).length),a[i]&&o.push(r);o.length&&s.push({elem:l,handlers:o})}return l=this,u<t.length&&s.push({elem:l,handlers:t.slice(u)}),s},addProp:function(t,e){Object.defineProperty(S.Event.prototype,t,{enumerable:!0,configurable:!0,get:m(e)?function(){if(this.originalEvent)return e(this.originalEvent)}:function(){if(this.originalEvent)return this.originalEvent[t]},set:function(e){Object.defineProperty(this,t,{enumerable:!0,configurable:!0,writable:!0,value:e})}})},fix:function(e){return e[S.expando]?e:new S.Event(e)},special:{load:{noBubble:!0},click:{setup:function(e){var t=this||e;return pe.test(t.type)&&t.click&&A(t,"input")&&Se(t,"click",we),!1},trigger:function(e){var t=this||e;return pe.test(t.type)&&t.click&&A(t,"input")&&Se(t,"click"),!0},_default:function(e){var t=e.target;return pe.test(t.type)&&t.click&&A(t,"input")&&Y.get(t,"click")||A(t,"a")}},beforeunload:{postDispatch:function(e){void 
0!==e.result&&e.originalEvent&&(e.originalEvent.returnValue=e.result)}}}},S.removeEvent=function(e,t,n){e.removeEventListener&&e.removeEventListener(t,n)},S.Event=function(e,t){if(!(this instanceof S.Event))return new S.Event(e,t);e&&e.type?(this.originalEvent=e,this.type=e.type,this.isDefaultPrevented=e.defaultPrevented||void 0===e.defaultPrevented&&!1===e.returnValue?we:Te,this.target=e.target&&3===e.target.nodeType?e.target.parentNode:e.target,this.currentTarget=e.currentTarget,this.relatedTarget=e.relatedTarget):this.type=e,t&&S.extend(this,t),this.timeStamp=e&&e.timeStamp||Date.now(),this[S.expando]=!0},S.Event.prototype={constructor:S.Event,isDefaultPrevented:Te,isPropagationStopped:Te,isImmediatePropagationStopped:Te,isSimulated:!1,preventDefault:function(){var e=this.originalEvent;this.isDefaultPrevented=we,e&&!this.isSimulated&&e.preventDefault()},stopPropagation:function(){var e=this.originalEvent;this.isPropagationStopped=we,e&&!this.isSimulated&&e.stopPropagation()},stopImmediatePropagation:function(){var e=this.originalEvent;this.isImmediatePropagationStopped=we,e&&!this.isSimulated&&e.stopImmediatePropagation(),this.stopPropagation()}},S.each({altKey:!0,bubbles:!0,cancelable:!0,changedTouches:!0,ctrlKey:!0,detail:!0,eventPhase:!0,metaKey:!0,pageX:!0,pageY:!0,shiftKey:!0,view:!0,"char":!0,code:!0,charCode:!0,key:!0,keyCode:!0,button:!0,buttons:!0,clientX:!0,clientY:!0,offsetX:!0,offsetY:!0,pointerId:!0,pointerType:!0,screenX:!0,screenY:!0,targetTouches:!0,toElement:!0,touches:!0,which:!0},S.event.addProp),S.each({focus:"focusin",blur:"focusout"},function(e,t){S.event.special[e]={setup:function(){return Se(this,e,Ce),!1},trigger:function(){return Se(this,e),!0},_default:function(){return!0},delegateType:t}}),S.each({mouseenter:"mouseover",mouseleave:"mouseout",pointerenter:"pointerover",pointerleave:"pointerout"},function(e,i){S.event.special[e]={delegateType:i,bindType:i,handle:function(e){var t,n=e.relatedTarget,r=e.handleObj;return 
n&&(n===this||S.contains(this,n))||(e.type=r.origType,t=r.handler.apply(this,arguments),e.type=i),t}}}),S.fn.extend({on:function(e,t,n,r){return Ee(this,e,t,n,r)},one:function(e,t,n,r){return Ee(this,e,t,n,r,1)},off:function(e,t,n){var r,i;if(e&&e.preventDefault&&e.handleObj)return r=e.handleObj,S(e.delegateTarget).off(r.namespace?r.origType+"."+r.namespace:r.origType,r.selector,r.handler),this;if("object"==typeof e){for(i in e)this.off(i,t,e[i]);return this}return!1!==t&&"function"!=typeof t||(n=t,t=void 0),!1===n&&(n=Te),this.each(function(){S.event.remove(this,e,n,t)})}});var ke=/<script|<style|<link/i,Ae=/checked\s*(?:[^=]|=\s*.checked.)/i,Ne=/^\s*<!(?:\[CDATA\[|--)|(?:\]\]|--)>\s*$/g;function je(e,t){return A(e,"table")&&A(11!==t.nodeType?t:t.firstChild,"tr")&&S(e).children("tbody")[0]||e}function De(e){return e.type=(null!==e.getAttribute("type"))+"/"+e.type,e}function qe(e){return"true/"===(e.type||"").slice(0,5)?e.type=e.type.slice(5):e.removeAttribute("type"),e}function Le(e,t){var n,r,i,o,a,s;if(1===t.nodeType){if(Y.hasData(e)&&(s=Y.get(e).events))for(i in Y.remove(t,"handle events"),s)for(n=0,r=s[i].length;n<r;n++)S.event.add(t,i,s[i][n]);Q.hasData(e)&&(o=Q.access(e),a=S.extend({},o),Q.set(t,a))}}function He(n,r,i,o){r=g(r);var e,t,a,s,u,l,c=0,f=n.length,p=f-1,d=r[0],h=m(d);if(h||1<f&&"string"==typeof d&&!y.checkClone&&Ae.test(d))return n.each(function(e){var t=n.eq(e);h&&(r[0]=d.call(this,e,t.html())),He(t,r,i,o)});if(f&&(t=(e=xe(r,n[0].ownerDocument,!1,n,o)).firstChild,1===e.childNodes.length&&(e=t),t||o)){for(s=(a=S.map(ve(e,"script"),De)).length;c<f;c++)u=e,c!==p&&(u=S.clone(u,!0,!0),s&&S.merge(a,ve(u,"script"))),i.call(n[c],u,c);if(s)for(l=a[a.length-1].ownerDocument,S.map(a,qe),c=0;c<s;c++)u=a[c],he.test(u.type||"")&&!Y.access(u,"globalEval")&&S.contains(l,u)&&(u.src&&"module"!==(u.type||"").toLowerCase()?S._evalUrl&&!u.noModule&&S._evalUrl(u.src,{nonce:u.nonce||u.getAttribute("nonce")},l):b(u.textContent.replace(Ne,""),u,l))}return n}function 
Oe(e,t,n){for(var r,i=t?S.filter(t,e):e,o=0;null!=(r=i[o]);o++)n||1!==r.nodeType||S.cleanData(ve(r)),r.parentNode&&(n&&ie(r)&&ye(ve(r,"script")),r.parentNode.removeChild(r));return e}S.extend({htmlPrefilter:function(e){return e},clone:function(e,t,n){var r,i,o,a,s,u,l,c=e.cloneNode(!0),f=ie(e);if(!(y.noCloneChecked||1!==e.nodeType&&11!==e.nodeType||S.isXMLDoc(e)))for(a=ve(c),r=0,i=(o=ve(e)).length;r<i;r++)s=o[r],u=a[r],void 0,"input"===(l=u.nodeName.toLowerCase())&&pe.test(s.type)?u.checked=s.checked:"input"!==l&&"textarea"!==l||(u.defaultValue=s.defaultValue);if(t)if(n)for(o=o||ve(e),a=a||ve(c),r=0,i=o.length;r<i;r++)Le(o[r],a[r]);else Le(e,c);return 0<(a=ve(c,"script")).length&&ye(a,!f&&ve(e,"script")),c},cleanData:function(e){for(var t,n,r,i=S.event.special,o=0;void 0!==(n=e[o]);o++)if(V(n)){if(t=n[Y.expando]){if(t.events)for(r in t.events)i[r]?S.event.remove(n,r):S.removeEvent(n,r,t.handle);n[Y.expando]=void 0}n[Q.expando]&&(n[Q.expando]=void 0)}}}),S.fn.extend({detach:function(e){return Oe(this,e,!0)},remove:function(e){return Oe(this,e)},text:function(e){return $(this,function(e){return void 0===e?S.text(this):this.empty().each(function(){1!==this.nodeType&&11!==this.nodeType&&9!==this.nodeType||(this.textContent=e)})},null,e,arguments.length)},append:function(){return He(this,arguments,function(e){1!==this.nodeType&&11!==this.nodeType&&9!==this.nodeType||je(this,e).appendChild(e)})},prepend:function(){return He(this,arguments,function(e){if(1===this.nodeType||11===this.nodeType||9===this.nodeType){var t=je(this,e);t.insertBefore(e,t.firstChild)}})},before:function(){return He(this,arguments,function(e){this.parentNode&&this.parentNode.insertBefore(e,this)})},after:function(){return He(this,arguments,function(e){this.parentNode&&this.parentNode.insertBefore(e,this.nextSibling)})},empty:function(){for(var e,t=0;null!=(e=this[t]);t++)1===e.nodeType&&(S.cleanData(ve(e,!1)),e.textContent="");return this},clone:function(e,t){return 
e=null!=e&&e,t=null==t?e:t,this.map(function(){return S.clone(this,e,t)})},html:function(e){return $(this,function(e){var t=this[0]||{},n=0,r=this.length;if(void 0===e&&1===t.nodeType)return t.innerHTML;if("string"==typeof e&&!ke.test(e)&&!ge[(de.exec(e)||["",""])[1].toLowerCase()]){e=S.htmlPrefilter(e);try{for(;n<r;n++)1===(t=this[n]||{}).nodeType&&(S.cleanData(ve(t,!1)),t.innerHTML=e);t=0}catch(e){}}t&&this.empty().append(e)},null,e,arguments.length)},replaceWith:function(){var n=[];return He(this,arguments,function(e){var t=this.parentNode;S.inArray(this,n)<0&&(S.cleanData(ve(this)),t&&t.replaceChild(e,this))},n)}}),S.each({appendTo:"append",prependTo:"prepend",insertBefore:"before",insertAfter:"after",replaceAll:"replaceWith"},function(e,a){S.fn[e]=function(e){for(var t,n=[],r=S(e),i=r.length-1,o=0;o<=i;o++)t=o===i?this:this.clone(!0),S(r[o])[a](t),u.apply(n,t.get());return this.pushStack(n)}});var Pe=new RegExp("^("+ee+")(?!px)[a-z%]+$","i"),Re=function(e){var t=e.ownerDocument.defaultView;return t&&t.opener||(t=C),t.getComputedStyle(e)},Me=function(e,t,n){var r,i,o={};for(i in t)o[i]=e.style[i],e.style[i]=t[i];for(i in r=n.call(e),t)e.style[i]=o[i];return r},Ie=new RegExp(ne.join("|"),"i");function We(e,t,n){var r,i,o,a,s=e.style;return(n=n||Re(e))&&(""!==(a=n.getPropertyValue(t)||n[t])||ie(e)||(a=S.style(e,t)),!y.pixelBoxStyles()&&Pe.test(a)&&Ie.test(t)&&(r=s.width,i=s.minWidth,o=s.maxWidth,s.minWidth=s.maxWidth=s.width=a,a=n.width,s.width=r,s.minWidth=i,s.maxWidth=o)),void 0!==a?a+"":a}function Fe(e,t){return{get:function(){if(!e())return(this.get=t).apply(this,arguments);delete this.get}}}!function(){function e(){if(l){u.style.cssText="position:absolute;left:-11111px;width:60px;margin-top:1px;padding:0;border:0",l.style.cssText="position:relative;display:block;box-sizing:border-box;overflow:scroll;margin:auto;border:1px;padding:1px;width:60%;top:1%",re.appendChild(u).appendChild(l);var 
e=C.getComputedStyle(l);n="1%"!==e.top,s=12===t(e.marginLeft),l.style.right="60%",o=36===t(e.right),r=36===t(e.width),l.style.position="absolute",i=12===t(l.offsetWidth/3),re.removeChild(u),l=null}}function t(e){return Math.round(parseFloat(e))}var n,r,i,o,a,s,u=E.createElement("div"),l=E.createElement("div");l.style&&(l.style.backgroundClip="content-box",l.cloneNode(!0).style.backgroundClip="",y.clearCloneStyle="content-box"===l.style.backgroundClip,S.extend(y,{boxSizingReliable:function(){return e(),r},pixelBoxStyles:function(){return e(),o},pixelPosition:function(){return e(),n},reliableMarginLeft:function(){return e(),s},scrollboxSize:function(){return e(),i},reliableTrDimensions:function(){var e,t,n,r;return null==a&&(e=E.createElement("table"),t=E.createElement("tr"),n=E.createElement("div"),e.style.cssText="position:absolute;left:-11111px;border-collapse:separate",t.style.cssText="border:1px solid",t.style.height="1px",n.style.height="9px",n.style.display="block",re.appendChild(e).appendChild(t).appendChild(n),r=C.getComputedStyle(t),a=parseInt(r.height,10)+parseInt(r.borderTopWidth,10)+parseInt(r.borderBottomWidth,10)===t.offsetHeight,re.removeChild(e)),a}}))}();var Be=["Webkit","Moz","ms"],$e=E.createElement("div").style,_e={};function ze(e){var t=S.cssProps[e]||_e[e];return t||(e in $e?e:_e[e]=function(e){var t=e[0].toUpperCase()+e.slice(1),n=Be.length;while(n--)if((e=Be[n]+t)in $e)return e}(e)||e)}var Ue=/^(none|table(?!-c[ea]).+)/,Xe=/^--/,Ve={position:"absolute",visibility:"hidden",display:"block"},Ge={letterSpacing:"0",fontWeight:"400"};function Ye(e,t,n){var r=te.exec(t);return r?Math.max(0,r[2]-(n||0))+(r[3]||"px"):t}function Qe(e,t,n,r,i,o){var a="width"===t?1:0,s=0,u=0;if(n===(r?"border":"content"))return 
0;for(;a<4;a+=2)"margin"===n&&(u+=S.css(e,n+ne[a],!0,i)),r?("content"===n&&(u-=S.css(e,"padding"+ne[a],!0,i)),"margin"!==n&&(u-=S.css(e,"border"+ne[a]+"Width",!0,i))):(u+=S.css(e,"padding"+ne[a],!0,i),"padding"!==n?u+=S.css(e,"border"+ne[a]+"Width",!0,i):s+=S.css(e,"border"+ne[a]+"Width",!0,i));return!r&&0<=o&&(u+=Math.max(0,Math.ceil(e["offset"+t[0].toUpperCase()+t.slice(1)]-o-u-s-.5))||0),u}function Je(e,t,n){var r=Re(e),i=(!y.boxSizingReliable()||n)&&"border-box"===S.css(e,"boxSizing",!1,r),o=i,a=We(e,t,r),s="offset"+t[0].toUpperCase()+t.slice(1);if(Pe.test(a)){if(!n)return a;a="auto"}return(!y.boxSizingReliable()&&i||!y.reliableTrDimensions()&&A(e,"tr")||"auto"===a||!parseFloat(a)&&"inline"===S.css(e,"display",!1,r))&&e.getClientRects().length&&(i="border-box"===S.css(e,"boxSizing",!1,r),(o=s in e)&&(a=e[s])),(a=parseFloat(a)||0)+Qe(e,t,n||(i?"border":"content"),o,r,a)+"px"}function Ke(e,t,n,r,i){return new Ke.prototype.init(e,t,n,r,i)}S.extend({cssHooks:{opacity:{get:function(e,t){if(t){var n=We(e,"opacity");return""===n?"1":n}}}},cssNumber:{animationIterationCount:!0,columnCount:!0,fillOpacity:!0,flexGrow:!0,flexShrink:!0,fontWeight:!0,gridArea:!0,gridColumn:!0,gridColumnEnd:!0,gridColumnStart:!0,gridRow:!0,gridRowEnd:!0,gridRowStart:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,widows:!0,zIndex:!0,zoom:!0},cssProps:{},style:function(e,t,n,r){if(e&&3!==e.nodeType&&8!==e.nodeType&&e.style){var i,o,a,s=X(t),u=Xe.test(t),l=e.style;if(u||(t=ze(s)),a=S.cssHooks[t]||S.cssHooks[s],void 0===n)return a&&"get"in a&&void 0!==(i=a.get(e,!1,r))?i:l[t];"string"===(o=typeof n)&&(i=te.exec(n))&&i[1]&&(n=se(e,t,i),o="number"),null!=n&&n==n&&("number"!==o||u||(n+=i&&i[3]||(S.cssNumber[s]?"":"px")),y.clearCloneStyle||""!==n||0!==t.indexOf("background")||(l[t]="inherit"),a&&"set"in a&&void 0===(n=a.set(e,n,r))||(u?l.setProperty(t,n):l[t]=n))}},css:function(e,t,n,r){var i,o,a,s=X(t);return Xe.test(t)||(t=ze(s)),(a=S.cssHooks[t]||S.cssHooks[s])&&"get"in a&&(i=a.get(e,!0,n)),void 
0===i&&(i=We(e,t,r)),"normal"===i&&t in Ge&&(i=Ge[t]),""===n||n?(o=parseFloat(i),!0===n||isFinite(o)?o||0:i):i}}),S.each(["height","width"],function(e,u){S.cssHooks[u]={get:function(e,t,n){if(t)return!Ue.test(S.css(e,"display"))||e.getClientRects().length&&e.getBoundingClientRect().width?Je(e,u,n):Me(e,Ve,function(){return Je(e,u,n)})},set:function(e,t,n){var r,i=Re(e),o=!y.scrollboxSize()&&"absolute"===i.position,a=(o||n)&&"border-box"===S.css(e,"boxSizing",!1,i),s=n?Qe(e,u,n,a,i):0;return a&&o&&(s-=Math.ceil(e["offset"+u[0].toUpperCase()+u.slice(1)]-parseFloat(i[u])-Qe(e,u,"border",!1,i)-.5)),s&&(r=te.exec(t))&&"px"!==(r[3]||"px")&&(e.style[u]=t,t=S.css(e,u)),Ye(0,t,s)}}}),S.cssHooks.marginLeft=Fe(y.reliableMarginLeft,function(e,t){if(t)return(parseFloat(We(e,"marginLeft"))||e.getBoundingClientRect().left-Me(e,{marginLeft:0},function(){return e.getBoundingClientRect().left}))+"px"}),S.each({margin:"",padding:"",border:"Width"},function(i,o){S.cssHooks[i+o]={expand:function(e){for(var t=0,n={},r="string"==typeof e?e.split(" "):[e];t<4;t++)n[i+ne[t]+o]=r[t]||r[t-2]||r[0];return n}},"margin"!==i&&(S.cssHooks[i+o].set=Ye)}),S.fn.extend({css:function(e,t){return $(this,function(e,t,n){var r,i,o={},a=0;if(Array.isArray(t)){for(r=Re(e),i=t.length;a<i;a++)o[t[a]]=S.css(e,t[a],!1,r);return o}return void 0!==n?S.style(e,t,n):S.css(e,t)},e,t,1<arguments.length)}}),((S.Tween=Ke).prototype={constructor:Ke,init:function(e,t,n,r,i,o){this.elem=e,this.prop=n,this.easing=i||S.easing._default,this.options=t,this.start=this.now=this.cur(),this.end=r,this.unit=o||(S.cssNumber[n]?"":"px")},cur:function(){var e=Ke.propHooks[this.prop];return e&&e.get?e.get(this):Ke.propHooks._default.get(this)},run:function(e){var t,n=Ke.propHooks[this.prop];return 
this.options.duration?this.pos=t=S.easing[this.easing](e,this.options.duration*e,0,1,this.options.duration):this.pos=t=e,this.now=(this.end-this.start)*t+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),n&&n.set?n.set(this):Ke.propHooks._default.set(this),this}}).init.prototype=Ke.prototype,(Ke.propHooks={_default:{get:function(e){var t;return 1!==e.elem.nodeType||null!=e.elem[e.prop]&&null==e.elem.style[e.prop]?e.elem[e.prop]:(t=S.css(e.elem,e.prop,""))&&"auto"!==t?t:0},set:function(e){S.fx.step[e.prop]?S.fx.step[e.prop](e):1!==e.elem.nodeType||!S.cssHooks[e.prop]&&null==e.elem.style[ze(e.prop)]?e.elem[e.prop]=e.now:S.style(e.elem,e.prop,e.now+e.unit)}}}).scrollTop=Ke.propHooks.scrollLeft={set:function(e){e.elem.nodeType&&e.elem.parentNode&&(e.elem[e.prop]=e.now)}},S.easing={linear:function(e){return e},swing:function(e){return.5-Math.cos(e*Math.PI)/2},_default:"swing"},S.fx=Ke.prototype.init,S.fx.step={};var Ze,et,tt,nt,rt=/^(?:toggle|show|hide)$/,it=/queueHooks$/;function ot(){et&&(!1===E.hidden&&C.requestAnimationFrame?C.requestAnimationFrame(ot):C.setTimeout(ot,S.fx.interval),S.fx.tick())}function at(){return C.setTimeout(function(){Ze=void 0}),Ze=Date.now()}function st(e,t){var n,r=0,i={height:e};for(t=t?1:0;r<4;r+=2-t)i["margin"+(n=ne[r])]=i["padding"+n]=e;return t&&(i.opacity=i.width=e),i}function ut(e,t,n){for(var r,i=(lt.tweeners[t]||[]).concat(lt.tweeners["*"]),o=0,a=i.length;o<a;o++)if(r=i[o].call(n,t,e))return r}function lt(o,e,t){var n,a,r=0,i=lt.prefilters.length,s=S.Deferred().always(function(){delete u.elem}),u=function(){if(a)return!1;for(var e=Ze||at(),t=Math.max(0,l.startTime+l.duration-e),n=1-(t/l.duration||0),r=0,i=l.tweens.length;r<i;r++)l.tweens[r].run(n);return 
s.notifyWith(o,[l,n,t]),n<1&&i?t:(i||s.notifyWith(o,[l,1,0]),s.resolveWith(o,[l]),!1)},l=s.promise({elem:o,props:S.extend({},e),opts:S.extend(!0,{specialEasing:{},easing:S.easing._default},t),originalProperties:e,originalOptions:t,startTime:Ze||at(),duration:t.duration,tweens:[],createTween:function(e,t){var n=S.Tween(o,l.opts,e,t,l.opts.specialEasing[e]||l.opts.easing);return l.tweens.push(n),n},stop:function(e){var t=0,n=e?l.tweens.length:0;if(a)return this;for(a=!0;t<n;t++)l.tweens[t].run(1);return e?(s.notifyWith(o,[l,1,0]),s.resolveWith(o,[l,e])):s.rejectWith(o,[l,e]),this}}),c=l.props;for(!function(e,t){var n,r,i,o,a;for(n in e)if(i=t[r=X(n)],o=e[n],Array.isArray(o)&&(i=o[1],o=e[n]=o[0]),n!==r&&(e[r]=o,delete e[n]),(a=S.cssHooks[r])&&"expand"in a)for(n in o=a.expand(o),delete e[r],o)n in e||(e[n]=o[n],t[n]=i);else t[r]=i}(c,l.opts.specialEasing);r<i;r++)if(n=lt.prefilters[r].call(l,o,c,l.opts))return m(n.stop)&&(S._queueHooks(l.elem,l.opts.queue).stop=n.stop.bind(n)),n;return S.map(c,ut,l),m(l.opts.start)&&l.opts.start.call(o,l),l.progress(l.opts.progress).done(l.opts.done,l.opts.complete).fail(l.opts.fail).always(l.opts.always),S.fx.timer(S.extend(u,{elem:o,anim:l,queue:l.opts.queue})),l}S.Animation=S.extend(lt,{tweeners:{"*":[function(e,t){var n=this.createTween(e,t);return se(n.elem,e,te.exec(t),n),n}]},tweener:function(e,t){m(e)?(t=e,e=["*"]):e=e.match(P);for(var n,r=0,i=e.length;r<i;r++)n=e[r],lt.tweeners[n]=lt.tweeners[n]||[],lt.tweeners[n].unshift(t)},prefilters:[function(e,t,n){var r,i,o,a,s,u,l,c,f="width"in t||"height"in t,p=this,d={},h=e.style,g=e.nodeType&&ae(e),v=Y.get(e,"fxshow");for(r in n.queue||(null==(a=S._queueHooks(e,"fx")).unqueued&&(a.unqueued=0,s=a.empty.fire,a.empty.fire=function(){a.unqueued||s()}),a.unqueued++,p.always(function(){p.always(function(){a.unqueued--,S.queue(e,"fx").length||a.empty.fire()})})),t)if(i=t[r],rt.test(i)){if(delete t[r],o=o||"toggle"===i,i===(g?"hide":"show")){if("show"!==i||!v||void 
0===v[r])continue;g=!0}d[r]=v&&v[r]||S.style(e,r)}if((u=!S.isEmptyObject(t))||!S.isEmptyObject(d))for(r in f&&1===e.nodeType&&(n.overflow=[h.overflow,h.overflowX,h.overflowY],null==(l=v&&v.display)&&(l=Y.get(e,"display")),"none"===(c=S.css(e,"display"))&&(l?c=l:(le([e],!0),l=e.style.display||l,c=S.css(e,"display"),le([e]))),("inline"===c||"inline-block"===c&&null!=l)&&"none"===S.css(e,"float")&&(u||(p.done(function(){h.display=l}),null==l&&(c=h.display,l="none"===c?"":c)),h.display="inline-block")),n.overflow&&(h.overflow="hidden",p.always(function(){h.overflow=n.overflow[0],h.overflowX=n.overflow[1],h.overflowY=n.overflow[2]})),u=!1,d)u||(v?"hidden"in v&&(g=v.hidden):v=Y.access(e,"fxshow",{display:l}),o&&(v.hidden=!g),g&&le([e],!0),p.done(function(){for(r in g||le([e]),Y.remove(e,"fxshow"),d)S.style(e,r,d[r])})),u=ut(g?v[r]:0,r,p),r in v||(v[r]=u.start,g&&(u.end=u.start,u.start=0))}],prefilter:function(e,t){t?lt.prefilters.unshift(e):lt.prefilters.push(e)}}),S.speed=function(e,t,n){var r=e&&"object"==typeof e?S.extend({},e):{complete:n||!n&&t||m(e)&&e,duration:e,easing:n&&t||t&&!m(t)&&t};return S.fx.off?r.duration=0:"number"!=typeof r.duration&&(r.duration in S.fx.speeds?r.duration=S.fx.speeds[r.duration]:r.duration=S.fx.speeds._default),null!=r.queue&&!0!==r.queue||(r.queue="fx"),r.old=r.complete,r.complete=function(){m(r.old)&&r.old.call(this),r.queue&&S.dequeue(this,r.queue)},r},S.fn.extend({fadeTo:function(e,t,n,r){return this.filter(ae).css("opacity",0).show().end().animate({opacity:t},e,n,r)},animate:function(t,e,n,r){var i=S.isEmptyObject(t),o=S.speed(e,n,r),a=function(){var e=lt(this,S.extend({},t),o);(i||Y.get(this,"finish"))&&e.stop(!0)};return a.finish=a,i||!1===o.queue?this.each(a):this.queue(o.queue,a)},stop:function(i,e,o){var a=function(e){var t=e.stop;delete e.stop,t(o)};return"string"!=typeof i&&(o=e,e=i,i=void 0),e&&this.queue(i||"fx",[]),this.each(function(){var 
e=!0,t=null!=i&&i+"queueHooks",n=S.timers,r=Y.get(this);if(t)r[t]&&r[t].stop&&a(r[t]);else for(t in r)r[t]&&r[t].stop&&it.test(t)&&a(r[t]);for(t=n.length;t--;)n[t].elem!==this||null!=i&&n[t].queue!==i||(n[t].anim.stop(o),e=!1,n.splice(t,1));!e&&o||S.dequeue(this,i)})},finish:function(a){return!1!==a&&(a=a||"fx"),this.each(function(){var e,t=Y.get(this),n=t[a+"queue"],r=t[a+"queueHooks"],i=S.timers,o=n?n.length:0;for(t.finish=!0,S.queue(this,a,[]),r&&r.stop&&r.stop.call(this,!0),e=i.length;e--;)i[e].elem===this&&i[e].queue===a&&(i[e].anim.stop(!0),i.splice(e,1));for(e=0;e<o;e++)n[e]&&n[e].finish&&n[e].finish.call(this);delete t.finish})}}),S.each(["toggle","show","hide"],function(e,r){var i=S.fn[r];S.fn[r]=function(e,t,n){return null==e||"boolean"==typeof e?i.apply(this,arguments):this.animate(st(r,!0),e,t,n)}}),S.each({slideDown:st("show"),slideUp:st("hide"),slideToggle:st("toggle"),fadeIn:{opacity:"show"},fadeOut:{opacity:"hide"},fadeToggle:{opacity:"toggle"}},function(e,r){S.fn[e]=function(e,t,n){return this.animate(r,e,t,n)}}),S.timers=[],S.fx.tick=function(){var e,t=0,n=S.timers;for(Ze=Date.now();t<n.length;t++)(e=n[t])()||n[t]!==e||n.splice(t--,1);n.length||S.fx.stop(),Ze=void 0},S.fx.timer=function(e){S.timers.push(e),S.fx.start()},S.fx.interval=13,S.fx.start=function(){et||(et=!0,ot())},S.fx.stop=function(){et=null},S.fx.speeds={slow:600,fast:200,_default:400},S.fn.delay=function(r,e){return r=S.fx&&S.fx.speeds[r]||r,e=e||"fx",this.queue(e,function(e,t){var n=C.setTimeout(e,r);t.stop=function(){C.clearTimeout(n)}})},tt=E.createElement("input"),nt=E.createElement("select").appendChild(E.createElement("option")),tt.type="checkbox",y.checkOn=""!==tt.value,y.optSelected=nt.selected,(tt=E.createElement("input")).value="t",tt.type="radio",y.radioValue="t"===tt.value;var ct,ft=S.expr.attrHandle;S.fn.extend({attr:function(e,t){return $(this,S.attr,e,t,1<arguments.length)},removeAttr:function(e){return 
this.each(function(){S.removeAttr(this,e)})}}),S.extend({attr:function(e,t,n){var r,i,o=e.nodeType;if(3!==o&&8!==o&&2!==o)return"undefined"==typeof e.getAttribute?S.prop(e,t,n):(1===o&&S.isXMLDoc(e)||(i=S.attrHooks[t.toLowerCase()]||(S.expr.match.bool.test(t)?ct:void 0)),void 0!==n?null===n?void S.removeAttr(e,t):i&&"set"in i&&void 0!==(r=i.set(e,n,t))?r:(e.setAttribute(t,n+""),n):i&&"get"in i&&null!==(r=i.get(e,t))?r:null==(r=S.find.attr(e,t))?void 0:r)},attrHooks:{type:{set:function(e,t){if(!y.radioValue&&"radio"===t&&A(e,"input")){var n=e.value;return e.setAttribute("type",t),n&&(e.value=n),t}}}},removeAttr:function(e,t){var n,r=0,i=t&&t.match(P);if(i&&1===e.nodeType)while(n=i[r++])e.removeAttribute(n)}}),ct={set:function(e,t,n){return!1===t?S.removeAttr(e,n):e.setAttribute(n,n),n}},S.each(S.expr.match.bool.source.match(/\w+/g),function(e,t){var a=ft[t]||S.find.attr;ft[t]=function(e,t,n){var r,i,o=t.toLowerCase();return n||(i=ft[o],ft[o]=r,r=null!=a(e,t,n)?o:null,ft[o]=i),r}});var pt=/^(?:input|select|textarea|button)$/i,dt=/^(?:a|area)$/i;function ht(e){return(e.match(P)||[]).join(" ")}function gt(e){return e.getAttribute&&e.getAttribute("class")||""}function vt(e){return Array.isArray(e)?e:"string"==typeof e&&e.match(P)||[]}S.fn.extend({prop:function(e,t){return $(this,S.prop,e,t,1<arguments.length)},removeProp:function(e){return this.each(function(){delete this[S.propFix[e]||e]})}}),S.extend({prop:function(e,t,n){var r,i,o=e.nodeType;if(3!==o&&8!==o&&2!==o)return 1===o&&S.isXMLDoc(e)||(t=S.propFix[t]||t,i=S.propHooks[t]),void 0!==n?i&&"set"in i&&void 0!==(r=i.set(e,n,t))?r:e[t]=n:i&&"get"in i&&null!==(r=i.get(e,t))?r:e[t]},propHooks:{tabIndex:{get:function(e){var t=S.find.attr(e,"tabindex");return t?parseInt(t,10):pt.test(e.nodeName)||dt.test(e.nodeName)&&e.href?0:-1}}},propFix:{"for":"htmlFor","class":"className"}}),y.optSelected||(S.propHooks.selected={get:function(e){var t=e.parentNode;return 
t&&t.parentNode&&t.parentNode.selectedIndex,null},set:function(e){var t=e.parentNode;t&&(t.selectedIndex,t.parentNode&&t.parentNode.selectedIndex)}}),S.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){S.propFix[this.toLowerCase()]=this}),S.fn.extend({addClass:function(t){var e,n,r,i,o,a,s,u=0;if(m(t))return this.each(function(e){S(this).addClass(t.call(this,e,gt(this)))});if((e=vt(t)).length)while(n=this[u++])if(i=gt(n),r=1===n.nodeType&&" "+ht(i)+" "){a=0;while(o=e[a++])r.indexOf(" "+o+" ")<0&&(r+=o+" ");i!==(s=ht(r))&&n.setAttribute("class",s)}return this},removeClass:function(t){var e,n,r,i,o,a,s,u=0;if(m(t))return this.each(function(e){S(this).removeClass(t.call(this,e,gt(this)))});if(!arguments.length)return this.attr("class","");if((e=vt(t)).length)while(n=this[u++])if(i=gt(n),r=1===n.nodeType&&" "+ht(i)+" "){a=0;while(o=e[a++])while(-1<r.indexOf(" "+o+" "))r=r.replace(" "+o+" "," ");i!==(s=ht(r))&&n.setAttribute("class",s)}return this},toggleClass:function(i,t){var o=typeof i,a="string"===o||Array.isArray(i);return"boolean"==typeof t&&a?t?this.addClass(i):this.removeClass(i):m(i)?this.each(function(e){S(this).toggleClass(i.call(this,e,gt(this),t),t)}):this.each(function(){var e,t,n,r;if(a){t=0,n=S(this),r=vt(i);while(e=r[t++])n.hasClass(e)?n.removeClass(e):n.addClass(e)}else void 0!==i&&"boolean"!==o||((e=gt(this))&&Y.set(this,"__className__",e),this.setAttribute&&this.setAttribute("class",e||!1===i?"":Y.get(this,"__className__")||""))})},hasClass:function(e){var t,n,r=0;t=" "+e+" ";while(n=this[r++])if(1===n.nodeType&&-1<(" "+ht(gt(n))+" ").indexOf(t))return!0;return!1}});var yt=/\r/g;S.fn.extend({val:function(n){var r,e,i,t=this[0];return arguments.length?(i=m(n),this.each(function(e){var t;1===this.nodeType&&(null==(t=i?n.call(this,e,S(this).val()):n)?t="":"number"==typeof t?t+="":Array.isArray(t)&&(t=S.map(t,function(e){return 
null==e?"":e+""})),(r=S.valHooks[this.type]||S.valHooks[this.nodeName.toLowerCase()])&&"set"in r&&void 0!==r.set(this,t,"value")||(this.value=t))})):t?(r=S.valHooks[t.type]||S.valHooks[t.nodeName.toLowerCase()])&&"get"in r&&void 0!==(e=r.get(t,"value"))?e:"string"==typeof(e=t.value)?e.replace(yt,""):null==e?"":e:void 0}}),S.extend({valHooks:{option:{get:function(e){var t=S.find.attr(e,"value");return null!=t?t:ht(S.text(e))}},select:{get:function(e){var t,n,r,i=e.options,o=e.selectedIndex,a="select-one"===e.type,s=a?null:[],u=a?o+1:i.length;for(r=o<0?u:a?o:0;r<u;r++)if(((n=i[r]).selected||r===o)&&!n.disabled&&(!n.parentNode.disabled||!A(n.parentNode,"optgroup"))){if(t=S(n).val(),a)return t;s.push(t)}return s},set:function(e,t){var n,r,i=e.options,o=S.makeArray(t),a=i.length;while(a--)((r=i[a]).selected=-1<S.inArray(S.valHooks.option.get(r),o))&&(n=!0);return n||(e.selectedIndex=-1),o}}}}),S.each(["radio","checkbox"],function(){S.valHooks[this]={set:function(e,t){if(Array.isArray(t))return e.checked=-1<S.inArray(S(e).val(),t)}},y.checkOn||(S.valHooks[this].get=function(e){return null===e.getAttribute("value")?"on":e.value})}),y.focusin="onfocusin"in C;var mt=/^(?:focusinfocus|focusoutblur)$/,xt=function(e){e.stopPropagation()};S.extend(S.event,{trigger:function(e,t,n,r){var i,o,a,s,u,l,c,f,p=[n||E],d=v.call(e,"type")?e.type:e,h=v.call(e,"namespace")?e.namespace.split("."):[];if(o=f=a=n=n||E,3!==n.nodeType&&8!==n.nodeType&&!mt.test(d+S.event.triggered)&&(-1<d.indexOf(".")&&(d=(h=d.split(".")).shift(),h.sort()),u=d.indexOf(":")<0&&"on"+d,(e=e[S.expando]?e:new S.Event(d,"object"==typeof e&&e)).isTrigger=r?2:3,e.namespace=h.join("."),e.rnamespace=e.namespace?new RegExp("(^|\\.)"+h.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,e.result=void 
0,e.target||(e.target=n),t=null==t?[e]:S.makeArray(t,[e]),c=S.event.special[d]||{},r||!c.trigger||!1!==c.trigger.apply(n,t))){if(!r&&!c.noBubble&&!x(n)){for(s=c.delegateType||d,mt.test(s+d)||(o=o.parentNode);o;o=o.parentNode)p.push(o),a=o;a===(n.ownerDocument||E)&&p.push(a.defaultView||a.parentWindow||C)}i=0;while((o=p[i++])&&!e.isPropagationStopped())f=o,e.type=1<i?s:c.bindType||d,(l=(Y.get(o,"events")||Object.create(null))[e.type]&&Y.get(o,"handle"))&&l.apply(o,t),(l=u&&o[u])&&l.apply&&V(o)&&(e.result=l.apply(o,t),!1===e.result&&e.preventDefault());return e.type=d,r||e.isDefaultPrevented()||c._default&&!1!==c._default.apply(p.pop(),t)||!V(n)||u&&m(n[d])&&!x(n)&&((a=n[u])&&(n[u]=null),S.event.triggered=d,e.isPropagationStopped()&&f.addEventListener(d,xt),n[d](),e.isPropagationStopped()&&f.removeEventListener(d,xt),S.event.triggered=void 0,a&&(n[u]=a)),e.result}},simulate:function(e,t,n){var r=S.extend(new S.Event,n,{type:e,isSimulated:!0});S.event.trigger(r,null,t)}}),S.fn.extend({trigger:function(e,t){return this.each(function(){S.event.trigger(e,t,this)})},triggerHandler:function(e,t){var n=this[0];if(n)return S.event.trigger(e,t,n,!0)}}),y.focusin||S.each({focus:"focusin",blur:"focusout"},function(n,r){var i=function(e){S.event.simulate(r,e.target,S.event.fix(e))};S.event.special[r]={setup:function(){var e=this.ownerDocument||this.document||this,t=Y.access(e,r);t||e.addEventListener(n,i,!0),Y.access(e,r,(t||0)+1)},teardown:function(){var e=this.ownerDocument||this.document||this,t=Y.access(e,r)-1;t?Y.access(e,r,t):(e.removeEventListener(n,i,!0),Y.remove(e,r))}}});var bt=C.location,wt={guid:Date.now()},Tt=/\?/;S.parseXML=function(e){var t,n;if(!e||"string"!=typeof e)return null;try{t=(new C.DOMParser).parseFromString(e,"text/xml")}catch(e){}return n=t&&t.getElementsByTagName("parsererror")[0],t&&!n||S.error("Invalid XML: "+(n?S.map(n.childNodes,function(e){return e.textContent}).join("\n"):e)),t};var 
Ct=/\[\]$/,Et=/\r?\n/g,St=/^(?:submit|button|image|reset|file)$/i,kt=/^(?:input|select|textarea|keygen)/i;function At(n,e,r,i){var t;if(Array.isArray(e))S.each(e,function(e,t){r||Ct.test(n)?i(n,t):At(n+"["+("object"==typeof t&&null!=t?e:"")+"]",t,r,i)});else if(r||"object"!==w(e))i(n,e);else for(t in e)At(n+"["+t+"]",e[t],r,i)}S.param=function(e,t){var n,r=[],i=function(e,t){var n=m(t)?t():t;r[r.length]=encodeURIComponent(e)+"="+encodeURIComponent(null==n?"":n)};if(null==e)return"";if(Array.isArray(e)||e.jquery&&!S.isPlainObject(e))S.each(e,function(){i(this.name,this.value)});else for(n in e)At(n,e[n],t,i);return r.join("&")},S.fn.extend({serialize:function(){return S.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var e=S.prop(this,"elements");return e?S.makeArray(e):this}).filter(function(){var e=this.type;return this.name&&!S(this).is(":disabled")&&kt.test(this.nodeName)&&!St.test(e)&&(this.checked||!pe.test(e))}).map(function(e,t){var n=S(this).val();return null==n?null:Array.isArray(n)?S.map(n,function(e){return{name:t.name,value:e.replace(Et,"\r\n")}}):{name:t.name,value:n.replace(Et,"\r\n")}}).get()}});var Nt=/%20/g,jt=/#.*$/,Dt=/([?&])_=[^&]*/,qt=/^(.*?):[ \t]*([^\r\n]*)$/gm,Lt=/^(?:GET|HEAD)$/,Ht=/^\/\//,Ot={},Pt={},Rt="*/".concat("*"),Mt=E.createElement("a");function It(o){return function(e,t){"string"!=typeof e&&(t=e,e="*");var n,r=0,i=e.toLowerCase().match(P)||[];if(m(t))while(n=i[r++])"+"===n[0]?(n=n.slice(1)||"*",(o[n]=o[n]||[]).unshift(t)):(o[n]=o[n]||[]).push(t)}}function Wt(t,i,o,a){var s={},u=t===Pt;function l(e){var r;return s[e]=!0,S.each(t[e]||[],function(e,t){var n=t(i,o,a);return"string"!=typeof n||u||s[n]?u?!(r=n):void 0:(i.dataTypes.unshift(n),l(n),!1)}),r}return l(i.dataTypes[0])||!s["*"]&&l("*")}function Ft(e,t){var n,r,i=S.ajaxSettings.flatOptions||{};for(n in t)void 0!==t[n]&&((i[n]?e:r||(r={}))[n]=t[n]);return 
r&&S.extend(!0,e,r),e}Mt.href=bt.href,S.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:bt.href,type:"GET",isLocal:/^(?:about|app|app-storage|.+-extension|file|res|widget):$/.test(bt.protocol),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":Rt,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/\bxml\b/,html:/\bhtml/,json:/\bjson\b/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":JSON.parse,"text xml":S.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(e,t){return t?Ft(Ft(e,S.ajaxSettings),t):Ft(S.ajaxSettings,e)},ajaxPrefilter:It(Ot),ajaxTransport:It(Pt),ajax:function(e,t){"object"==typeof e&&(t=e,e=void 0),t=t||{};var c,f,p,n,d,r,h,g,i,o,v=S.ajaxSetup({},t),y=v.context||v,m=v.context&&(y.nodeType||y.jquery)?S(y):S.event,x=S.Deferred(),b=S.Callbacks("once memory"),w=v.statusCode||{},a={},s={},u="canceled",T={readyState:0,getResponseHeader:function(e){var t;if(h){if(!n){n={};while(t=qt.exec(p))n[t[1].toLowerCase()+" "]=(n[t[1].toLowerCase()+" "]||[]).concat(t[2])}t=n[e.toLowerCase()+" "]}return null==t?null:t.join(", ")},getAllResponseHeaders:function(){return h?p:null},setRequestHeader:function(e,t){return null==h&&(e=s[e.toLowerCase()]=s[e.toLowerCase()]||e,a[e]=t),this},overrideMimeType:function(e){return null==h&&(v.mimeType=e),this},statusCode:function(e){var t;if(e)if(h)T.always(e[T.status]);else for(t in e)w[t]=[w[t],e[t]];return this},abort:function(e){var t=e||u;return 
c&&c.abort(t),l(0,t),this}};if(x.promise(T),v.url=((e||v.url||bt.href)+"").replace(Ht,bt.protocol+"//"),v.type=t.method||t.type||v.method||v.type,v.dataTypes=(v.dataType||"*").toLowerCase().match(P)||[""],null==v.crossDomain){r=E.createElement("a");try{r.href=v.url,r.href=r.href,v.crossDomain=Mt.protocol+"//"+Mt.host!=r.protocol+"//"+r.host}catch(e){v.crossDomain=!0}}if(v.data&&v.processData&&"string"!=typeof v.data&&(v.data=S.param(v.data,v.traditional)),Wt(Ot,v,t,T),h)return T;for(i in(g=S.event&&v.global)&&0==S.active++&&S.event.trigger("ajaxStart"),v.type=v.type.toUpperCase(),v.hasContent=!Lt.test(v.type),f=v.url.replace(jt,""),v.hasContent?v.data&&v.processData&&0===(v.contentType||"").indexOf("application/x-www-form-urlencoded")&&(v.data=v.data.replace(Nt,"+")):(o=v.url.slice(f.length),v.data&&(v.processData||"string"==typeof v.data)&&(f+=(Tt.test(f)?"&":"?")+v.data,delete v.data),!1===v.cache&&(f=f.replace(Dt,"$1"),o=(Tt.test(f)?"&":"?")+"_="+wt.guid+++o),v.url=f+o),v.ifModified&&(S.lastModified[f]&&T.setRequestHeader("If-Modified-Since",S.lastModified[f]),S.etag[f]&&T.setRequestHeader("If-None-Match",S.etag[f])),(v.data&&v.hasContent&&!1!==v.contentType||t.contentType)&&T.setRequestHeader("Content-Type",v.contentType),T.setRequestHeader("Accept",v.dataTypes[0]&&v.accepts[v.dataTypes[0]]?v.accepts[v.dataTypes[0]]+("*"!==v.dataTypes[0]?", "+Rt+"; q=0.01":""):v.accepts["*"]),v.headers)T.setRequestHeader(i,v.headers[i]);if(v.beforeSend&&(!1===v.beforeSend.call(y,T,v)||h))return T.abort();if(u="abort",b.add(v.complete),T.done(v.success),T.fail(v.error),c=Wt(Pt,v,t,T)){if(T.readyState=1,g&&m.trigger("ajaxSend",[T,v]),h)return T;v.async&&0<v.timeout&&(d=C.setTimeout(function(){T.abort("timeout")},v.timeout));try{h=!1,c.send(a,l)}catch(e){if(h)throw e;l(-1,e)}}else l(-1,"No Transport");function l(e,t,n,r){var i,o,a,s,u,l=t;h||(h=!0,d&&C.clearTimeout(d),c=void 0,p=r||"",T.readyState=0<e?4:0,i=200<=e&&e<300||304===e,n&&(s=function(e,t,n){var 
r,i,o,a,s=e.contents,u=e.dataTypes;while("*"===u[0])u.shift(),void 0===r&&(r=e.mimeType||t.getResponseHeader("Content-Type"));if(r)for(i in s)if(s[i]&&s[i].test(r)){u.unshift(i);break}if(u[0]in n)o=u[0];else{for(i in n){if(!u[0]||e.converters[i+" "+u[0]]){o=i;break}a||(a=i)}o=o||a}if(o)return o!==u[0]&&u.unshift(o),n[o]}(v,T,n)),!i&&-1<S.inArray("script",v.dataTypes)&&S.inArray("json",v.dataTypes)<0&&(v.converters["text script"]=function(){}),s=function(e,t,n,r){var i,o,a,s,u,l={},c=e.dataTypes.slice();if(c[1])for(a in e.converters)l[a.toLowerCase()]=e.converters[a];o=c.shift();while(o)if(e.responseFields[o]&&(n[e.responseFields[o]]=t),!u&&r&&e.dataFilter&&(t=e.dataFilter(t,e.dataType)),u=o,o=c.shift())if("*"===o)o=u;else if("*"!==u&&u!==o){if(!(a=l[u+" "+o]||l["* "+o]))for(i in l)if((s=i.split(" "))[1]===o&&(a=l[u+" "+s[0]]||l["* "+s[0]])){!0===a?a=l[i]:!0!==l[i]&&(o=s[0],c.unshift(s[1]));break}if(!0!==a)if(a&&e["throws"])t=a(t);else try{t=a(t)}catch(e){return{state:"parsererror",error:a?e:"No conversion from "+u+" to "+o}}}return{state:"success",data:t}}(v,s,T,i),i?(v.ifModified&&((u=T.getResponseHeader("Last-Modified"))&&(S.lastModified[f]=u),(u=T.getResponseHeader("etag"))&&(S.etag[f]=u)),204===e||"HEAD"===v.type?l="nocontent":304===e?l="notmodified":(l=s.state,o=s.data,i=!(a=s.error))):(a=l,!e&&l||(l="error",e<0&&(e=0))),T.status=e,T.statusText=(t||l)+"",i?x.resolveWith(y,[o,l,T]):x.rejectWith(y,[T,l,a]),T.statusCode(w),w=void 0,g&&m.trigger(i?"ajaxSuccess":"ajaxError",[T,v,i?o:a]),b.fireWith(y,[T,l]),g&&(m.trigger("ajaxComplete",[T,v]),--S.active||S.event.trigger("ajaxStop")))}return T},getJSON:function(e,t,n){return S.get(e,t,n,"json")},getScript:function(e,t){return S.get(e,void 0,t,"script")}}),S.each(["get","post"],function(e,i){S[i]=function(e,t,n,r){return m(t)&&(r=r||n,n=t,t=void 0),S.ajax(S.extend({url:e,type:i,dataType:r,data:t,success:n},S.isPlainObject(e)&&e))}}),S.ajaxPrefilter(function(e){var t;for(t in 
e.headers)"content-type"===t.toLowerCase()&&(e.contentType=e.headers[t]||"")}),S._evalUrl=function(e,t,n){return S.ajax({url:e,type:"GET",dataType:"script",cache:!0,async:!1,global:!1,converters:{"text script":function(){}},dataFilter:function(e){S.globalEval(e,t,n)}})},S.fn.extend({wrapAll:function(e){var t;return this[0]&&(m(e)&&(e=e.call(this[0])),t=S(e,this[0].ownerDocument).eq(0).clone(!0),this[0].parentNode&&t.insertBefore(this[0]),t.map(function(){var e=this;while(e.firstElementChild)e=e.firstElementChild;return e}).append(this)),this},wrapInner:function(n){return m(n)?this.each(function(e){S(this).wrapInner(n.call(this,e))}):this.each(function(){var e=S(this),t=e.contents();t.length?t.wrapAll(n):e.append(n)})},wrap:function(t){var n=m(t);return this.each(function(e){S(this).wrapAll(n?t.call(this,e):t)})},unwrap:function(e){return this.parent(e).not("body").each(function(){S(this).replaceWith(this.childNodes)}),this}}),S.expr.pseudos.hidden=function(e){return!S.expr.pseudos.visible(e)},S.expr.pseudos.visible=function(e){return!!(e.offsetWidth||e.offsetHeight||e.getClientRects().length)},S.ajaxSettings.xhr=function(){try{return new C.XMLHttpRequest}catch(e){}};var Bt={0:200,1223:204},$t=S.ajaxSettings.xhr();y.cors=!!$t&&"withCredentials"in $t,y.ajax=$t=!!$t,S.ajaxTransport(function(i){var o,a;if(y.cors||$t&&!i.crossDomain)return{send:function(e,t){var n,r=i.xhr();if(r.open(i.type,i.url,i.async,i.username,i.password),i.xhrFields)for(n in i.xhrFields)r[n]=i.xhrFields[n];for(n in i.mimeType&&r.overrideMimeType&&r.overrideMimeType(i.mimeType),i.crossDomain||e["X-Requested-With"]||(e["X-Requested-With"]="XMLHttpRequest"),e)r.setRequestHeader(n,e[n]);o=function(e){return function(){o&&(o=a=r.onload=r.onerror=r.onabort=r.ontimeout=r.onreadystatechange=null,"abort"===e?r.abort():"error"===e?"number"!=typeof r.status?t(0,"error"):t(r.status,r.statusText):t(Bt[r.status]||r.status,r.statusText,"text"!==(r.responseType||"text")||"string"!=typeof 
r.responseText?{binary:r.response}:{text:r.responseText},r.getAllResponseHeaders()))}},r.onload=o(),a=r.onerror=r.ontimeout=o("error"),void 0!==r.onabort?r.onabort=a:r.onreadystatechange=function(){4===r.readyState&&C.setTimeout(function(){o&&a()})},o=o("abort");try{r.send(i.hasContent&&i.data||null)}catch(e){if(o)throw e}},abort:function(){o&&o()}}}),S.ajaxPrefilter(function(e){e.crossDomain&&(e.contents.script=!1)}),S.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/\b(?:java|ecma)script\b/},converters:{"text script":function(e){return S.globalEval(e),e}}}),S.ajaxPrefilter("script",function(e){void 0===e.cache&&(e.cache=!1),e.crossDomain&&(e.type="GET")}),S.ajaxTransport("script",function(n){var r,i;if(n.crossDomain||n.scriptAttrs)return{send:function(e,t){r=S("<script>").attr(n.scriptAttrs||{}).prop({charset:n.scriptCharset,src:n.url}).on("load error",i=function(e){r.remove(),i=null,e&&t("error"===e.type?404:200,e.type)}),E.head.appendChild(r[0])},abort:function(){i&&i()}}});var _t,zt=[],Ut=/(=)\?(?=&|$)|\?\?/;S.ajaxSetup({jsonp:"callback",jsonpCallback:function(){var e=zt.pop()||S.expando+"_"+wt.guid++;return this[e]=!0,e}}),S.ajaxPrefilter("json jsonp",function(e,t,n){var r,i,o,a=!1!==e.jsonp&&(Ut.test(e.url)?"url":"string"==typeof e.data&&0===(e.contentType||"").indexOf("application/x-www-form-urlencoded")&&Ut.test(e.data)&&"data");if(a||"jsonp"===e.dataTypes[0])return r=e.jsonpCallback=m(e.jsonpCallback)?e.jsonpCallback():e.jsonpCallback,a?e[a]=e[a].replace(Ut,"$1"+r):!1!==e.jsonp&&(e.url+=(Tt.test(e.url)?"&":"?")+e.jsonp+"="+r),e.converters["script json"]=function(){return o||S.error(r+" was not called"),o[0]},e.dataTypes[0]="json",i=C[r],C[r]=function(){o=arguments},n.always(function(){void 0===i?S(C).removeProp(r):C[r]=i,e[r]&&(e.jsonpCallback=t.jsonpCallback,zt.push(r)),o&&m(i)&&i(o[0]),o=i=void 
0}),"script"}),y.createHTMLDocument=((_t=E.implementation.createHTMLDocument("").body).innerHTML="<form></form><form></form>",2===_t.childNodes.length),S.parseHTML=function(e,t,n){return"string"!=typeof e?[]:("boolean"==typeof t&&(n=t,t=!1),t||(y.createHTMLDocument?((r=(t=E.implementation.createHTMLDocument("")).createElement("base")).href=E.location.href,t.head.appendChild(r)):t=E),o=!n&&[],(i=N.exec(e))?[t.createElement(i[1])]:(i=xe([e],t,o),o&&o.length&&S(o).remove(),S.merge([],i.childNodes)));var r,i,o},S.fn.load=function(e,t,n){var r,i,o,a=this,s=e.indexOf(" ");return-1<s&&(r=ht(e.slice(s)),e=e.slice(0,s)),m(t)?(n=t,t=void 0):t&&"object"==typeof t&&(i="POST"),0<a.length&&S.ajax({url:e,type:i||"GET",dataType:"html",data:t}).done(function(e){o=arguments,a.html(r?S("<div>").append(S.parseHTML(e)).find(r):e)}).always(n&&function(e,t){a.each(function(){n.apply(this,o||[e.responseText,t,e])})}),this},S.expr.pseudos.animated=function(t){return S.grep(S.timers,function(e){return t===e.elem}).length},S.offset={setOffset:function(e,t,n){var r,i,o,a,s,u,l=S.css(e,"position"),c=S(e),f={};"static"===l&&(e.style.position="relative"),s=c.offset(),o=S.css(e,"top"),u=S.css(e,"left"),("absolute"===l||"fixed"===l)&&-1<(o+u).indexOf("auto")?(a=(r=c.position()).top,i=r.left):(a=parseFloat(o)||0,i=parseFloat(u)||0),m(t)&&(t=t.call(e,n,S.extend({},s))),null!=t.top&&(f.top=t.top-s.top+a),null!=t.left&&(f.left=t.left-s.left+i),"using"in t?t.using.call(e,f):c.css(f)}},S.fn.extend({offset:function(t){if(arguments.length)return void 0===t?this:this.each(function(e){S.offset.setOffset(this,t,e)});var e,n,r=this[0];return r?r.getClientRects().length?(e=r.getBoundingClientRect(),n=r.ownerDocument.defaultView,{top:e.top+n.pageYOffset,left:e.left+n.pageXOffset}):{top:0,left:0}:void 0},position:function(){if(this[0]){var 
e,t,n,r=this[0],i={top:0,left:0};if("fixed"===S.css(r,"position"))t=r.getBoundingClientRect();else{t=this.offset(),n=r.ownerDocument,e=r.offsetParent||n.documentElement;while(e&&(e===n.body||e===n.documentElement)&&"static"===S.css(e,"position"))e=e.parentNode;e&&e!==r&&1===e.nodeType&&((i=S(e).offset()).top+=S.css(e,"borderTopWidth",!0),i.left+=S.css(e,"borderLeftWidth",!0))}return{top:t.top-i.top-S.css(r,"marginTop",!0),left:t.left-i.left-S.css(r,"marginLeft",!0)}}},offsetParent:function(){return this.map(function(){var e=this.offsetParent;while(e&&"static"===S.css(e,"position"))e=e.offsetParent;return e||re})}}),S.each({scrollLeft:"pageXOffset",scrollTop:"pageYOffset"},function(t,i){var o="pageYOffset"===i;S.fn[t]=function(e){return $(this,function(e,t,n){var r;if(x(e)?r=e:9===e.nodeType&&(r=e.defaultView),void 0===n)return r?r[i]:e[t];r?r.scrollTo(o?r.pageXOffset:n,o?n:r.pageYOffset):e[t]=n},t,e,arguments.length)}}),S.each(["top","left"],function(e,n){S.cssHooks[n]=Fe(y.pixelPosition,function(e,t){if(t)return t=We(e,n),Pe.test(t)?S(e).position()[n]+"px":t})}),S.each({Height:"height",Width:"width"},function(a,s){S.each({padding:"inner"+a,content:s,"":"outer"+a},function(r,o){S.fn[o]=function(e,t){var n=arguments.length&&(r||"boolean"!=typeof e),i=r||(!0===e||!0===t?"margin":"border");return $(this,function(e,t,n){var r;return x(e)?0===o.indexOf("outer")?e["inner"+a]:e.document.documentElement["client"+a]:9===e.nodeType?(r=e.documentElement,Math.max(e.body["scroll"+a],r["scroll"+a],e.body["offset"+a],r["offset"+a],r["client"+a])):void 0===n?S.css(e,t,i):S.style(e,t,n,i)},s,n?e:void 0,n)}})}),S.each(["ajaxStart","ajaxStop","ajaxComplete","ajaxError","ajaxSuccess","ajaxSend"],function(e,t){S.fn[t]=function(e){return this.on(t,e)}}),S.fn.extend({bind:function(e,t,n){return this.on(e,null,t,n)},unbind:function(e,t){return this.off(e,null,t)},delegate:function(e,t,n,r){return this.on(t,e,n,r)},undelegate:function(e,t,n){return 
1===arguments.length?this.off(e,"**"):this.off(t,e||"**",n)},hover:function(e,t){return this.mouseenter(e).mouseleave(t||e)}}),S.each("blur focus focusin focusout resize scroll click dblclick mousedown mouseup mousemove mouseover mouseout mouseenter mouseleave change select submit keydown keypress keyup contextmenu".split(" "),function(e,n){S.fn[n]=function(e,t){return 0<arguments.length?this.on(n,null,e,t):this.trigger(n)}});var Xt=/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g;S.proxy=function(e,t){var n,r,i;if("string"==typeof t&&(n=e[t],t=e,e=n),m(e))return r=s.call(arguments,2),(i=function(){return e.apply(t||this,r.concat(s.call(arguments)))}).guid=e.guid=e.guid||S.guid++,i},S.holdReady=function(e){e?S.readyWait++:S.ready(!0)},S.isArray=Array.isArray,S.parseJSON=JSON.parse,S.nodeName=A,S.isFunction=m,S.isWindow=x,S.camelCase=X,S.type=w,S.now=Date.now,S.isNumeric=function(e){var t=S.type(e);return("number"===t||"string"===t)&&!isNaN(e-parseFloat(e))},S.trim=function(e){return null==e?"":(e+"").replace(Xt,"")},"function"==typeof define&&define.amd&&define("jquery",[],function(){return S});var Vt=C.jQuery,Gt=C.$;return S.noConflict=function(e){return C.$===S&&(C.$=Gt),e&&C.jQuery===S&&(C.jQuery=Vt),S},"undefined"==typeof e&&(C.jQuery=C.$=S),S});
diff --git a/js/theme.js b/js/theme.js
new file mode 100644
index 00000000..9299d964
--- /dev/null
+++ b/js/theme.js
@@ -0,0 +1,2 @@
+/* sphinx_rtd_theme version 1.0.0 | MIT license | Webpack bundle defining ThemeNav, the sidebar navigation controller exposed as window.SphinxRtdTheme.Navigation and window.SphinxRtdTheme.StickyNav; also installs a requestAnimationFrame/cancelAnimationFrame fallback. Local patches: (1) init() keeps the toctree expand button in a local variable instead of assigning an undeclared global `expand`; (2) hashChange() captures the ThemeNav object so its one-shot hashchange callback clears ThemeNav.linkScroll rather than setting a property on the window element. */
+!function(n){var e={};function t(i){if(e[i])return e[i].exports;var o=e[i]={i:i,l:!1,exports:{}};return n[i].call(o.exports,o,o.exports,t),o.l=!0,o.exports}t.m=n,t.c=e,t.d=function(n,e,i){t.o(n,e)||Object.defineProperty(n,e,{enumerable:!0,get:i})},t.r=function(n){"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(n,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(n,"__esModule",{value:!0})},t.t=function(n,e){if(1&e&&(n=t(n)),8&e)return n;if(4&e&&"object"==typeof n&&n&&n.__esModule)return n;var i=Object.create(null);if(t.r(i),Object.defineProperty(i,"default",{enumerable:!0,value:n}),2&e&&"string"!=typeof n)for(var o in n)t.d(i,o,function(e){return n[e]}.bind(null,o));return i},t.n=function(n){var e=n&&n.__esModule?function(){return n.default}:function(){return n};return t.d(e,"a",e),e},t.o=function(n,e){return Object.prototype.hasOwnProperty.call(n,e)},t.p="",t(t.s=0)}([function(n,e,t){t(1),n.exports=t(3)},function(n,e,t){(function(){var e="undefined"!=typeof window?window.jQuery:t(2);n.exports.ThemeNav={navBar:null,win:null,winScroll:!1,winResize:!1,linkScroll:!1,winPosition:0,winHeight:null,docHeight:null,isRunning:!1,enable:function(n){var t=this;void 0===n&&(n=!0),t.isRunning||(t.isRunning=!0,e((function(e){t.init(e),t.reset(),t.win.on("hashchange",t.reset),n&&t.win.on("scroll",(function(){t.linkScroll||t.winScroll||(t.winScroll=!0,requestAnimationFrame((function(){t.onScroll()})))})),t.win.on("resize",(function(){t.winResize||(t.winResize=!0,requestAnimationFrame((function(){t.onResize()})))})),t.onResize()})))},enableSticky:function(){this.enable(!0)},init:function(n){n(document);var e=this;this.navBar=n("div.wy-side-scroll:first"),this.win=n(window),n(document).on("click","[data-toggle='wy-nav-top']",(function(){n("[data-toggle='wy-nav-shift']").toggleClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift")})).on("click",".wy-menu-vertical .current ul li a",(function(){var t=n(this);n("[data-toggle='wy-nav-shift']").removeClass("shift"),n("[data-toggle='rst-versions']").toggleClass("shift"),e.toggleCurrent(t),e.hashChange()})).on("click","[data-toggle='rst-current-version']",(function(){n("[data-toggle='rst-versions']").toggleClass("shift-up")})),n("table.docutils:not(.field-list,.footnote,.citation)").wrap("<div class='wy-table-responsive'></div>"),n("table.docutils.footnote").wrap("<div class='wy-table-responsive footnote'></div>"),n("table.docutils.citation").wrap("<div class='wy-table-responsive citation'></div>"),n(".wy-menu-vertical ul").not(".simple").siblings("a").each((function(){/* the expand button is declared locally; the original assigned an undeclared global here */var t=n(this),i=n('<button class="toctree-expand" title="Open/close menu"></button>');i.on("click",(function(n){return e.toggleCurrent(t),n.stopPropagation(),!1})),t.prepend(i)}))},reset:function(){var n=encodeURI(window.location.hash)||"#";try{var e=$(".wy-menu-vertical"),t=e.find('[href="'+n+'"]');if(0===t.length){var i=$('.document [id="'+n.substring(1)+'"]').closest("div.section");0===(t=e.find('[href="#'+i.attr("id")+'"]')).length&&(t=e.find('[href="#"]'))}if(t.length>0){$(".wy-menu-vertical .current").removeClass("current").attr("aria-expanded","false"),t.addClass("current").attr("aria-expanded","true"),t.closest("li.toctree-l1").parent().addClass("current").attr("aria-expanded","true");for(let n=1;n<=10;n++)t.closest("li.toctree-l"+n).addClass("current").attr("aria-expanded","true");t[0].scrollIntoView()}}catch(n){console.log("Error expanding nav for anchor",n)}},onScroll:function(){this.winScroll=!1;var n=this.win.scrollTop(),e=n+this.winHeight,t=this.navBar.scrollTop()+(n-this.winPosition);n<0||e>this.docHeight||(this.navBar.scrollTop(t),this.winPosition=n)},onResize:function(){this.winResize=!1,this.winHeight=this.win.height(),this.docHeight=$(document).height()},hashChange:function(){/* jQuery invokes the handler with `this` set to the window element, so the controller is captured in a closure variable */var n=this;n.linkScroll=!0,n.win.one("hashchange",(function(){n.linkScroll=!1}))},toggleCurrent:function(n){var e=n.closest("li");e.siblings("li.current").removeClass("current").attr("aria-expanded","false"),e.siblings().find("li.current").removeClass("current").attr("aria-expanded","false");var t=e.find("> ul li");t.length&&(t.removeClass("current").attr("aria-expanded","false"),e.toggleClass("current").attr("aria-expanded",(function(n,e){return"true"==e?"false":"true"})))}},"undefined"!=typeof window&&(window.SphinxRtdTheme={Navigation:n.exports.ThemeNav,StickyNav:n.exports.ThemeNav}),function(){for(var n=0,e=["ms","moz","webkit","o"],t=0;t<e.length&&!window.requestAnimationFrame;++t)window.requestAnimationFrame=window[e[t]+"RequestAnimationFrame"],window.cancelAnimationFrame=window[e[t]+"CancelAnimationFrame"]||window[e[t]+"CancelRequestAnimationFrame"];window.requestAnimationFrame||(window.requestAnimationFrame=function(e,t){var i=(new Date).getTime(),o=Math.max(0,16-(i-n)),r=window.setTimeout((function(){e(i+o)}),o);return n=i+o,r}),window.cancelAnimationFrame||(window.cancelAnimationFrame=function(n){clearTimeout(n)})}()}).call(window)},function(n,e){n.exports=jQuery},function(n,e,t){}]);
diff --git a/js/theme_extra.js b/js/theme_extra.js
new file mode 100644
index 00000000..d103ed6f
--- /dev/null
+++ b/js/theme_extra.js
@@ -0,0 +1,8 @@
+/*
+ * Tag every table inside the rendered page content (div.rst-content)
+ * with the 'docutils' class so the theme's table CSS and JS pick it up.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/2028
+ */
+
+$('div.rst-content').find('table').addClass('docutils');
diff --git a/lasx/bitwise_operations/index.html b/lasx/bitwise_operations/index.html
new file mode 100644
index 00000000..3eebca47
--- /dev/null
+++ b/lasx/bitwise_operations/index.html
@@ -0,0 +1,2289 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/bitwise_operations/" />
+    <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Bitwise Operations - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+    <link rel="stylesheet" href="../../main.css" />
+    <script>
+      // Per-page metadata, declared as globals for later scripts.
+      var mkdocs_page_name = "Bitwise Operations",
+          mkdocs_page_input_path = "lasx/bitwise_operations.md",
+          mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/bitwise_operations/";
+    </script>
+    <!-- The conditional comment below is only honored by IE older than 9. -->
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+    <script>hljs.highlightAll();</script>
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Bitwise Operations</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitsel_v-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitseli_b-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_1">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclr_b-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_2">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclr_h-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_3">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclr_w-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_4">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclr_d-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_5">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclri_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_6">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclri_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_7">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclri_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_8">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitclri_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_9">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitset_b-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_10">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitset_h-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_11">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitset_w-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_12">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitset_d-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_13">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitseti_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_14">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitseti_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_15">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitseti_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_16">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitseti_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_17">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrev_b-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_18">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrev_h-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_19">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrev_w-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_20">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrev_d-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_21">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrevi_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_22">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrevi_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_23">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrevi_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_24">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbitrevi_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_25">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclo_b-__m256i-a">__m256i __lasx_xvclo_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_26">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclo_h-__m256i-a">__m256i __lasx_xvclo_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_27">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclo_w-__m256i-a">__m256i __lasx_xvclo_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_28">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclo_d-__m256i-a">__m256i __lasx_xvclo_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_29">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclz_b-__m256i-a">__m256i __lasx_xvclz_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_30">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclz_h-__m256i-a">__m256i __lasx_xvclz_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_31">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclz_w-__m256i-a">__m256i __lasx_xvclz_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_32">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvclz_d-__m256i-a">__m256i __lasx_xvclz_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_33">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpcnt_b-__m256i-a">__m256i __lasx_xvpcnt_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_34">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpcnt_h-__m256i-a">__m256i __lasx_xvpcnt_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_35">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpcnt_w-__m256i-a">__m256i __lasx_xvpcnt_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_36">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpcnt_d-__m256i-a">__m256i __lasx_xvpcnt_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_37">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Bitwise Operations</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="bitwise-operations">Bitwise Operations</h1>
+<h2 id="__m256i-__lasx_xvbitsel_v-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitsel.v xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compute bitwise selection: for each bit position, if the bit in <code>c</code> is one, copy the bit from <code>b</code> to <code>dst</code>; otherwise copy it from <code>a</code>.</p>
+<h3 id="examples">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitsel_v(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0xffff0000aaaabbbb, 0x1111222233334444, 0x00000000ffffffff, 0xffffffff00000000})
+= 0xabab3344ffeeefab 0x98ba9beccfedfb00 0xabcdef1243214321 0x56785678ddeeddee
+</code></pre>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (c.dword[i] &amp; b.dword[i]) | (~c.dword[i] &amp; a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitseli_b-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitseli.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compute bitwise selection: for each bit position, if the bit in <code>a</code> is one, copy the bit from <code>imm</code> to <code>dst</code>; otherwise copy it from <code>b</code>.</p>
+<h3 id="examples_1">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseli_b( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0xba8b9aabba8b9a23 0x1216123012031221 0x1230123653115311 0x5652565212121212
+</code></pre>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (~a.byte[i] &amp; b.byte[i]) | (a.byte[i] &amp; (u8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclr_b-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclr.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_2">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_b(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700 0xabcdeb0212341234 0xaabaaaba9dee9dee
+</code></pre>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; (b.byte[i] % 8)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclr_h-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclr.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_3">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_h(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xf7fff7fff7fff7ff 0x99aabbccddecff00 0xabcdef0212341234 0xaabbaabbdceedcee
+</code></pre>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; (b.half[i] % 16)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclr_w-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclr.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_4">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_w(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xfffff7fffffff7ff 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbdceeddee
+</code></pre>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; (b.word[i] % 32)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclr_d-__m256i-a-__m256i-b">__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclr.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_5">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclr_d(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xfffff7ffffffffff 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaabbddeeddee
+</code></pre>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; (b.dword[i] % 64)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclri_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclri.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_6">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_b( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00 0xa9cded1010341034 0xa8b9a8b9ddecddec
+</code></pre>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclri_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclri.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_7">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_h( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffdfffdfffdfffd 0x99a8bbccddecff00 0xabcdef1012341234 0xaab9aab9ddecddec
+</code></pre>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclri_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclri.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_8">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_w( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffffffdfffffffd 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaab9ddeeddec
+</code></pre>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitclri_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitclri.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_9">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitclri_d( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0xfffffffffffffffd 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbddeeddec
+</code></pre>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitset_b-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitset.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Set the bit selected by each 8-bit element of <code>b</code> (taken modulo 8) in the corresponding 8-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_10">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_b(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0808080808080808 0x9dbabfdcddeeff02 0xafddef121a361a36 0xeabbeabbddefddef
+</code></pre>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; (b.byte[i] % 8));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitset_h-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitset.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Set the bit selected by each 16-bit element of <code>b</code> (taken modulo 16) in the corresponding 16-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_11">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_h(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0800080008000800 0x99babbdcddeeff02 0xabddef1212361236 0xabbbabbbddeeddee
+</code></pre>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; (b.half[i] % 16));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitset_w-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitset.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Set the bit selected by each 32-bit element of <code>b</code> (taken modulo 32) in the corresponding 32-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_12">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_w(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0000080000000800 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbddeeddee
+</code></pre>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; (b.word[i] % 32));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitset_d-__m256i-a-__m256i-b">__m256i __lasx_xvbitset_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitset.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Set the bit selected by each 64-bit element of <code>b</code> (taken modulo 64) in the corresponding 64-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_13">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitset_d(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0000080000000000 0x99aabbceddeeff00 0xabcdef1212341234 0xabbbaabbddeeddee
+</code></pre>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; (b.dword[i] % 64));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitseti_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitseti.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Set the bit specified by <code>imm</code> in each 8-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_14">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_b( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0202020202020202 0x9baabbcedfeeff02 0xabcfef1212361236 0xaabbaabbdfeedfee
+</code></pre>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitseti_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitseti.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Set the bit specified by <code>imm</code> in each 16-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_15">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_h( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0002000200020002 0x99aabbceddeeff02 0xabcfef1212361236 0xaabbaabbddeeddee
+</code></pre>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitseti_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitseti.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Set the bit specified by <code>imm</code> in each 32-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_16">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_w( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0000000200000002 0x99aabbceddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee
+</code></pre>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitseti_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitseti.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Set the bit specified by <code>imm</code> in each 64-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_17">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitseti_d( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0000000000000002 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee
+</code></pre>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrev_b-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrev.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Toggle the bit selected by each 8-bit element of <code>b</code> (taken modulo 8) in the corresponding 8-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_18">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_b(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0707070707070707 0x9dbabfdcd5ecf702 0xafddeb021a361a36 0xeabaeaba9def9def
+</code></pre>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; (b.byte[i] % 8));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrev_h-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrev.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Toggle the bit selected by each 16-bit element of <code>b</code> (taken modulo 16) in the corresponding 16-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_19">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_h(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x070f070f070f070f 0x99babbdcddecff02 0xabddef0212361236 0xabbbabbbdceedcee
+</code></pre>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; (b.half[i] % 16));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrev_w-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrev.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>For each 32-bit element in <code>a</code>, toggle the bit whose index is given by the corresponding element in <code>b</code> (taken modulo 32), and save the result in <code>dst</code>.</p>
+<h3 id="examples_20">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_w(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0f0f070f0f0f070f 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbdceeddee
+</code></pre>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; (b.word[i] % 32));
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrev_d-__m256i-a-__m256i-b">__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrev.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>For each 64-bit element in <code>a</code>, toggle the bit whose index is given by the corresponding element in <code>b</code> (taken modulo 64), and save the result in <code>dst</code>.</p>
+<h3 id="examples_21">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrev_d(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00 0xabcdef1012341234 0xabbbaabbddeeddee
+</code></pre>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; (b.dword[i] % 64));
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrevi_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrevi.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 8-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_22">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_b( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02 0xa9cfed1010361036 0xa8b9a8b9dfecdfec
+</code></pre>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrevi_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrevi.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 16-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_23">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_h( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02 0xabcfef1012361236 0xaab9aab9ddecddec
+</code></pre>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrevi_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrevi.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 32-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_24">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_w( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02 0xabcdef1012341236 0xaabbaab9ddeeddec
+</code></pre>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbitrevi_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbitrevi.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 64-bit element of <code>a</code>, and save the result in <code>dst</code>.</p>
+<h3 id="examples_25">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbitrevi_d( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)
+= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddec
+</code></pre>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclo_b-__m256i-a">__m256i __lasx_xvclo_b (__m256i a)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclo.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Count the leading one bits (starting from the most significant bit) of each 8-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_26">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000001 0x0101010202030800 0x0102030000000000 0x0101010102030203
+</code></pre>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = clo(a.byte[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclo_h-__m256i-a">__m256i __lasx_xvclo_h (__m256i a)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclo.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Count the leading one bits (starting from the most significant bit) of each 16-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_27">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0001000100020008 0x0001000300000000 0x0001000100020002
+</code></pre>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = clo(a.half[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclo_w-__m256i-a">__m256i __lasx_xvclo_w (__m256i a)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclo.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Count the leading one bits (starting from the most significant bit) of each 32-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_28">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0000000100000002 0x0000000100000000 0x0000000100000002
+</code></pre>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = clo(a.word[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclo_d-__m256i-a">__m256i __lasx_xvclo_d (__m256i a)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclo.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Count the leading one bits (starting from the most significant bit) of each 64-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_29">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclo_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000000 0x0000000000000001 0x0000000000000001 0x0000000000000001
+</code></pre>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = clo(a.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclz_b-__m256i-a">__m256i __lasx_xvclz_b (__m256i a)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclz.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Count the leading zero bits (starting from the most significant bit) of each 8-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_30">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0302020101010100 0x0000000000000008 0x0000000303020302 0x0000000000000000
+</code></pre>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = clz(a.byte[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclz_h-__m256i-a">__m256i __lasx_xvclz_h (__m256i a)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclz.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Count the leading zero bits (starting from the most significant bit) of each 16-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_31">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0003000200010001 0x0000000000000000 0x0000000000030003 0x0000000000000000
+</code></pre>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = clz(a.half[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclz_w-__m256i-a">__m256i __lasx_xvclz_w (__m256i a)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclz.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Count the leading zero bits (starting from the most significant bit) of each 32-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_32">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000300000001 0x0000000000000000 0x0000000000000003 0x0000000000000000
+</code></pre>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = clz(a.word[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvclz_d-__m256i-a">__m256i __lasx_xvclz_d (__m256i a)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvclz.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Count the leading zero bits (starting from the most significant bit) of each 64-bit element in <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_33">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvclz_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000000000003 0x0000000000000000 0x0000000000000000 0x0000000000000000
+</code></pre>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = clz(a.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpcnt_b-__m256i-a">__m256i __lasx_xvpcnt_b (__m256i a)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpcnt.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Count the number of one bits (population count, popcount) in each 8-bit element of <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_34">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0202040204040602 0x0404060406060800 0x0505070202030203 0x0406040606060606
+</code></pre>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpcnt_h-__m256i-a">__m256i __lasx_xvpcnt_h (__m256i a)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpcnt.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Count the number of one bits (population count, popcount) in each 16-bit element of <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_35">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0004000600080008 0x0008000a000c0008 0x000a000900050005 0x000a000a000c000c
+</code></pre>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpcnt_w-__m256i-a">__m256i __lasx_xvpcnt_w (__m256i a)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpcnt.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Count the number of one bits (population count, popcount) in each 32-bit element of <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_36">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x0000000a00000010 0x0000001200000014 0x000000130000000a 0x0000001400000018
+</code></pre>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpcnt_d-__m256i-a">__m256i __lasx_xvpcnt_d (__m256i a)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpcnt.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Count the number of one bits (population count, popcount) in each 64-bit element of <code>a</code>, and save the counts in <code>dst</code>.</p>
+<h3 id="examples_37">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpcnt_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})
+= 0x000000000000001a 0x0000000000000026 0x000000000000001d 0x000000000000002c
+</code></pre>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../../viewer/" class="btn btn-neutral float-left" title="Browse All Intrinsics"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../branch/" class="btn btn-neutral float-right" title="Branch">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../../viewer/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../branch/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/branch/index.html b/lasx/branch/index.html
new file mode 100644
index 00000000..77ee7cf8
--- /dev/null
+++ b/lasx/branch/index.html
@@ -0,0 +1,709 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/branch/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Branch - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Branch";
+        var mkdocs_page_input_path = "lasx/branch.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/branch/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LASX</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Branch</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbz_v-__m256i-a">int __lasx_xbz_v (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbnz_v-__m256i-a">int __lasx_xbnz_v (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbz_b-__m256i-a">int __lasx_xbz_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbz_h-__m256i-a">int __lasx_xbz_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbz_w-__m256i-a">int __lasx_xbz_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbz_d-__m256i-a">int __lasx_xbz_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbnz_b-__m256i-a">int __lasx_xbnz_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbnz_h-__m256i-a">int __lasx_xbnz_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbnz_w-__m256i-a">int __lasx_xbnz_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xbnz_d-__m256i-a">int __lasx_xbnz_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Branch</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="branch">Branch</h1>
+<h2 id="int-__lasx_xbz_v-__m256i-a">int __lasx_xbz_v (__m256i a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbz_v (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseteqz.v fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Expected to be used in branches: branch if the whole vector <code>a</code> is equal to zero.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">dst = a.qword[0] == 0 &amp;&amp; a.qword[1] == 0;
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbnz_v-__m256i-a">int __lasx_xbnz_v (__m256i a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbnz_v (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetnez.v fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Expected to be used in branches: branch if the whole vector <code>a</code> is non-zero, i.e. at least one bit of <code>a</code> is set.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">dst = a.qword[0] != 0 || a.qword[1] != 0;
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbz_b-__m256i-a">int __lasx_xbz_b (__m256i a)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbz_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetanyeqz.b fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Expected to be used in branches: branch if any 8-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 32; i++) {
+  if (a.byte[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbz_h-__m256i-a">int __lasx_xbz_h (__m256i a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbz_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetanyeqz.h fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Expected to be used in branches: branch if any 16-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 16; i++) {
+  if (a.half[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbz_w-__m256i-a">int __lasx_xbz_w (__m256i a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbz_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetanyeqz.w fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Expected to be used in branches: branch if any 32-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 8; i++) {
+  if (a.word[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbz_d-__m256i-a">int __lasx_xbz_d (__m256i a)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbz_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetanyeqz.d fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Expected to be used in branches: branch if any 64-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 4; i++) {
+  if (a.dword[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbnz_b-__m256i-a">int __lasx_xbnz_b (__m256i a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbnz_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetallnez.b fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Expected to be used in branches: branch if all 8-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 32; i++) {
+  if (a.byte[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbnz_h-__m256i-a">int __lasx_xbnz_h (__m256i a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbnz_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetallnez.h fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Expected to be used in branches: branch if all 16-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 16; i++) {
+  if (a.half[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbnz_w-__m256i-a">int __lasx_xbnz_w (__m256i a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbnz_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetallnez.w fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Expected to be used in branches: branch if all 32-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 8; i++) {
+  if (a.word[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xbnz_d-__m256i-a">int __lasx_xbnz_d (__m256i a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xbnz_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsetallnez.d fcc, xr; bcnez
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Expected to be used in branches: branch if all 64-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 4; i++) {
+  if (a.dword[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../bitwise_operations/" class="btn btn-neutral float-left" title="Bitwise Operations"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_comparison/" class="btn btn-neutral float-right" title="Floating Point Comparison">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../bitwise_operations/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_comparison/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/float_comparison/index.html b/lasx/float_comparison/index.html
new file mode 100644
index 00000000..a69fb159
--- /dev/null
+++ b/lasx/float_comparison/index.html
@@ -0,0 +1,2443 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_comparison/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Comparison - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Comparison";
+        var mkdocs_page_input_path = "lasx/float_comparison.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/float_comparison/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Comparison</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_caf_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_caf_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_ceq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_ceq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cle_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cle_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_clt_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_clt_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cne_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cne_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cor_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cor_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cueq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cueq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cule_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cule_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cult_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cult_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cun_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cun_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cune_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_cune_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_saf_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_saf_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_seq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_seq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sle_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sle_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_slt_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_slt_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sne_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sne_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sor_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sor_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sueq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sueq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sule_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sule_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sult_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sult_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sun_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sun_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sune_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcmp_sune_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Floating Point Comparison</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-comparison">Floating Point Comparison</h1>
+<h2 id="__m256i-__lasx_xvfcmp_caf_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.caf.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_caf_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.caf.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_ceq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.ceq.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_ceq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.ceq.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cle_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cle.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cle_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cle.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_clt_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.clt.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_clt_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.clt.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cne_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cne.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cne_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cne.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cor_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cor.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cor_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cor.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cueq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cueq.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cueq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cueq.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cule_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cule.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cule_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cule.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cult_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cult.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cult_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cult.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cun_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cun.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cun_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cun.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cune_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cune.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_cune_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.cune.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_saf_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.saf.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF (Always False), all zeros otherwise) into <code>dst</code>. Because the AF condition never holds, every element of <code>dst</code> is set to all zeros. Trap for QNaN.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_saf_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.saf.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF (Always False), all zeros otherwise) into <code>dst</code>. Because the AF condition never holds, every element of <code>dst</code> is set to all zeros. Trap for QNaN.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_seq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.seq.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_seq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.seq.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sle_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sle.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sle_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sle.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_slt_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.slt.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_slt_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.slt.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sne_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sne.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sne_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sne.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sor_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sor.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sor_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sor.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sueq_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sueq.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sueq_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sueq.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sule_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sule.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sule_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sule.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sult_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sult.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sult_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sult.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sun_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sun.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sun_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sun.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sune_s-__m256-a-__m256-b">__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sune.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcmp_sune_d-__m256d-a-__m256d-b">__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcmp.sune.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../branch/" class="btn btn-neutral float-left" title="Branch"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_computation/" class="btn btn-neutral float-right" title="Floating Point Computation">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../branch/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_computation/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/float_computation/index.html b/lasx/float_computation/index.html
new file mode 100644
index 00000000..f0021902
--- /dev/null
+++ b/lasx/float_computation/index.html
@@ -0,0 +1,1447 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_computation/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Computation - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Computation";
+        var mkdocs_page_input_path = "lasx/float_computation.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/float_computation/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Computation</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfadd_s-__m256-a-__m256-b">__m256 __lasx_xvfadd_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfadd_d-__m256d-a-__m256d-b">__m256d __lasx_xvfadd_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfdiv_s-__m256-a-__m256-b">__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfdiv_d-__m256d-a-__m256d-b">__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmax_s-__m256-a-__m256-b">__m256 __lasx_xvfmax_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmax_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmax_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmaxa_s-__m256-a-__m256-b">__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmaxa_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmin_s-__m256-a-__m256-b">__m256 __lasx_xvfmin_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmin_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmin_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmina_s-__m256-a-__m256-b">__m256 __lasx_xvfmina_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmina_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmina_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmul_s-__m256-a-__m256-b">__m256 __lasx_xvfmul_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmul_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmul_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfsub_s-__m256-a-__m256-b">__m256 __lasx_xvfsub_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfsub_d-__m256d-a-__m256d-b">__m256d __lasx_xvfsub_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvflogb_s-__m256-a">__m256 __lasx_xvflogb_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvflogb_d-__m256d-a">__m256d __lasx_xvflogb_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfsqrt_s-__m256-a">__m256 __lasx_xvfsqrt_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfsqrt_d-__m256d-a">__m256d __lasx_xvfsqrt_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrsqrt_s-__m256-a">__m256 __lasx_xvfrsqrt_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrsqrt_d-__m256d-a">__m256d __lasx_xvfrsqrt_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrecip_s-__m256-a">__m256 __lasx_xvfrecip_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrecip_d-__m256d-a">__m256d __lasx_xvfrecip_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrsqrte_s-__m256-a">__m256 __lasx_xvfrsqrte_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrsqrte_d-__m256d-a">__m256d __lasx_xvfrsqrte_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrecipe_s-__m256-a">__m256 __lasx_xvfrecipe_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrecipe_d-__m256d-a">__m256d __lasx_xvfrecipe_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Floating Point Computation</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-computation">Floating Point Computation</h1>
+<h2 id="__m256-__lasx_xvfadd_s-__m256-a-__m256-b">__m256 __lasx_xvfadd_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfadd_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfadd.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Add single precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfadd_d-__m256d-a-__m256d-b">__m256d __lasx_xvfadd_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfadd_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfadd.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Add double precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfdiv_s-__m256-a-__m256-b">__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfdiv.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Divide single precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 19.5</td>
+<td>0.1(1/10.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfdiv_d-__m256d-a-__m256d-b">__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfdiv.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Divide double precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8, 21.5</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 17</td>
+<td>0.08(1/12.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmax_s-__m256-a-__m256-b">__m256 __lasx_xvfmax_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmax_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmax.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfmax_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmax_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmax_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmax.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmaxa_s-__m256-a-__m256-b">__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmaxa.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) &gt; abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfmaxa_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmaxa.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) &gt; abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmin_s-__m256-a-__m256-b">__m256 __lasx_xvfmin_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmin_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmin.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfmin_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmin_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmin_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmin.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmina_s-__m256-a-__m256-b">__m256 __lasx_xvfmina_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmina_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmina.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) &lt; abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfmina_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmina_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmina_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmina.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) &lt; abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmul_s-__m256-a-__m256-b">__m256 __lasx_xvfmul_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmul_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmul.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Multiply single precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfmul_d-__m256d-a-__m256d-b">__m256d __lasx_xvfmul_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmul_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmul.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Multiply double precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfsub_s-__m256-a-__m256-b">__m256 __lasx_xvfsub_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfsub_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfsub.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Subtract single precision floating point elements in <code>b</code> from elements in <code>a</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfsub_d-__m256d-a-__m256d-b">__m256d __lasx_xvfsub_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfsub_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfsub.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Subtract double precision floating point elements in <code>b</code> from elements in <code>a</code>.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvflogb_s-__m256-a">__m256 __lasx_xvflogb_s (__m256 a)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvflogb_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvflogb.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Compute base-2 logarithm of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = log2(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvflogb_d-__m256d-a">__m256d __lasx_xvflogb_d (__m256d a)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvflogb_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvflogb.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Compute base-2 logarithm of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = log2(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfsqrt_s-__m256-a">__m256 __lasx_xvfsqrt_s (__m256 a)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfsqrt_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfsqrt.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Compute square root of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = sqrt(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>15</td>
+<td>0.08(1/12)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>15</td>
+<td>0.07(1/13.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfsqrt_d-__m256d-a">__m256d __lasx_xvfsqrt_d (__m256d a)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfsqrt_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfsqrt.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Compute square root of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = sqrt(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>36</td>
+<td>0.06(1/17.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>36</td>
+<td>0.05(1/18.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrsqrt_s-__m256-a">__m256 __lasx_xvfrsqrt_s (__m256 a)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrsqrt_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrsqrt.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Compute reciprocal of square root of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>25</td>
+<td>0.05(1/19)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>25</td>
+<td>0.03(1/32)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrsqrt_d-__m256d-a">__m256d __lasx_xvfrsqrt_d (__m256d a)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrsqrt_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrsqrt.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Compute reciprocal of square root of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>15</td>
+<td>0.04(1/26.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>15</td>
+<td>0.04(1/27.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrecip_s-__m256-a">__m256 __lasx_xvfrecip_s (__m256 a)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrecip_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrecip.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Compute reciprocal of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = 1 / a.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>27</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>27</td>
+<td>0.14(1/7)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrecip_d-__m256d-a">__m256d __lasx_xvfrecip_d (__m256d a)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrecip_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrecip.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Compute reciprocal of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = 1 / a.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>23</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>23</td>
+<td>0.08(1/12)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrsqrte_s-__m256-a">__m256 __lasx_xvfrsqrte_s (__m256 a)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrsqrte_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrsqrte.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Compute estimated reciprocal of square root of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+</code></pre>
+<h2 id="__m256d-__lasx_xvfrsqrte_d-__m256d-a">__m256d __lasx_xvfrsqrte_d (__m256d a)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrsqrte_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrsqrte.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Compute estimated reciprocal of square root of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+</code></pre>
+<h2 id="__m256-__lasx_xvfrecipe_s-__m256-a">__m256 __lasx_xvfrecipe_s (__m256 a)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrecipe_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrecipe.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Compute estimated reciprocal of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+</code></pre>
+<h2 id="__m256d-__lasx_xvfrecipe_d-__m256d-a">__m256d __lasx_xvfrecipe_d (__m256d a)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrecipe_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrecipe.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Compute estimated reciprocal of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+</code></pre>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_comparison/" class="btn btn-neutral float-left" title="Floating Point Comparison"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_conversion/" class="btn btn-neutral float-right" title="Floating Point Conversion">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_comparison/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_conversion/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/float_conversion/index.html b/lasx/float_conversion/index.html
new file mode 100644
index 00000000..fce6577b
--- /dev/null
+++ b/lasx/float_conversion/index.html
@@ -0,0 +1,2235 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_conversion/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Conversion - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data: per-page globals; presumably read by the theme scripts loaded later in the page — confirm
+        var mkdocs_page_name = "Floating Point Conversion"; // page title, without the site-name suffix used in <title>
+        var mkdocs_page_input_path = "lasx/float_conversion.md"; // presumably the Markdown source this page was built from — confirm
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/float_conversion/"; // site-absolute path; same path as the page's canonical URL
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Conversion</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfcvth_d_s-__m256-a">__m256d __lasx_xvfcvth_d_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfcvtl_d_s-__m256-a">__m256d __lasx_xvfcvtl_d_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfcvt_s_d-__m256d-a-__m256d-b">__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfcvth_s_h-__m256i-a">__m256 __lasx_xvfcvth_s_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfcvtl_s_h-__m256i-a">__m256 __lasx_xvfcvtl_s_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfcvt_h_s-__m256-a-__m256-b">__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvffinth_d_w-__m256i-a">__m256d __lasx_xvffinth_d_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvffintl_d_w-__m256i-a">__m256d __lasx_xvffintl_d_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvffint_d_l-__m256i-a">__m256d __lasx_xvffint_d_l (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvffint_d_lu-__m256i-a">__m256d __lasx_xvffint_d_lu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvffint_s_w-__m256i-a">__m256 __lasx_xvffint_s_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvffint_s_wu-__m256i-a">__m256 __lasx_xvffint_s_wu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvffint_s_l-__m256i-a-__m256i-b">__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintl_l_s-__m256-a">__m256i __lasx_xvftintl_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftinth_l_s-__m256-a">__m256i __lasx_xvftinth_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrml_l_s-__m256-a">__m256i __lasx_xvftintrml_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrmh_l_s-__m256-a">__m256i __lasx_xvftintrmh_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrpl_l_s-__m256-a">__m256i __lasx_xvftintrpl_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrph_l_s-__m256-a">__m256i __lasx_xvftintrph_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrzl_l_s-__m256-a">__m256i __lasx_xvftintrzl_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrzh_l_s-__m256-a">__m256i __lasx_xvftintrzh_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrnel_l_s-__m256-a">__m256i __lasx_xvftintrnel_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrneh_l_s-__m256-a">__m256i __lasx_xvftintrneh_l_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftint_l_d-__m256d-a">__m256i __lasx_xvftint_l_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftint_w_s-__m256-a">__m256i __lasx_xvftint_w_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrm_l_d-__m256d-a">__m256i __lasx_xvftintrm_l_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrm_w_s-__m256-a">__m256i __lasx_xvftintrm_w_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrp_l_d-__m256d-a">__m256i __lasx_xvftintrp_l_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrp_w_s-__m256-a">__m256i __lasx_xvftintrp_w_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrz_l_d-__m256d-a">__m256i __lasx_xvftintrz_l_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrz_w_s-__m256-a">__m256i __lasx_xvftintrz_w_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrne_l_d-__m256d-a">__m256i __lasx_xvftintrne_l_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrne_w_s-__m256-a">__m256i __lasx_xvftintrne_w_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftint_lu_d-__m256d-a">__m256i __lasx_xvftint_lu_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftint_wu_s-__m256-a">__m256i __lasx_xvftint_wu_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrz_lu_d-__m256d-a">__m256i __lasx_xvftintrz_lu_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrz_wu_s-__m256-a">__m256i __lasx_xvftintrz_wu_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftint_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrm_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrp_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrz_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvftintrne_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Floating Point Conversion</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-conversion">Floating Point Conversion</h1>
+<h2 id="__m256d-__lasx_xvfcvth_d_s-__m256-a">__m256d __lasx_xvfcvth_d_s (__m256 a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfcvth_d_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcvth.d.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Convert single precision floating point elements in higher half of <code>a</code> to double precision.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp32[4 + i];
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfcvtl_d_s-__m256-a">__m256d __lasx_xvfcvtl_d_s (__m256 a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfcvtl_d_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcvtl.d.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Convert single precision floating point elements in lower half of <code>a</code> to double precision.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfcvt_s_d-__m256d-a-__m256d-b">__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcvt.s.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Convert double precision floating point elements in <code>a</code> and <code>b</code> to single precision.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    dst.fp32[i] = b.fp64[i];
+  } else {
+    dst.fp32[i] = a.fp64[i - 4];
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfcvth_s_h-__m256i-a">__m256 __lasx_xvfcvth_s_h (__m256i a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfcvth_s_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcvth.s.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Convert half precision floating point elements in higher half of <code>a</code> to single precision.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp16[8 + i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfcvtl_s_h-__m256i-a">__m256 __lasx_xvfcvtl_s_h (__m256i a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfcvtl_s_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcvtl.s.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Convert half precision floating point elements in lower half of <code>a</code> to single precision.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp16[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfcvt_h_s-__m256-a-__m256-b">__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfcvt.h.s xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Convert single precision floating point elements in <code>a</code> and <code>b</code> to half precision.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    dst.fp16[i] = b.fp32[i];
+  } else {
+    dst.fp16[i] = a.fp32[i - 8];
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvffinth_d_w-__m256i-a">__m256d __lasx_xvffinth_d_w (__m256i a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvffinth_d_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffinth.d.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Convert 32-bit integer elements in higher part of <code>a</code> to double precision floating point numbers.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvffintl_d_w-__m256i-a">__m256d __lasx_xvffintl_d_w (__m256i a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvffintl_d_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffintl.d.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Convert 32-bit integer elements in lower part of <code>a</code> to double precision floating point numbers.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvffint_d_l-__m256i-a">__m256d __lasx_xvffint_d_l (__m256i a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvffint_d_l (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffint.d.l xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Convert signed 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvffint_d_lu-__m256i-a">__m256d __lasx_xvffint_d_lu (__m256i a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvffint_d_lu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffint.d.lu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Convert unsigned 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvffint_s_w-__m256i-a">__m256 __lasx_xvffint_s_w (__m256i a)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvffint_s_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffint.s.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Convert signed 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvffint_s_wu-__m256i-a">__m256 __lasx_xvffint_s_wu (__m256i a)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvffint_s_wu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffint.s.wu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Convert unsigned 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvffint_s_l-__m256i-a-__m256i-b">__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvffint.s.l xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Convert 64-bit integer elements in <code>a</code> and <code>b</code> to single-precision floating point numbers.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] =
+      (i &lt; 4) ? (f32)(s64)b.dword[i]
+              : (f32)(s64)a.dword[i - 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintl_l_s-__m256-a">__m256i __lasx_xvftintl_l_s (__m256 a)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintl_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintl.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftinth_l_s-__m256-a">__m256i __lasx_xvftinth_l_s (__m256 a)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftinth_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftinth.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrml_l_s-__m256-a">__m256i __lasx_xvftintrml_l_s (__m256 a)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrml_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrml.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrmh_l_s-__m256-a">__m256i __lasx_xvftintrmh_l_s (__m256 a)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrmh_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrmh.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrpl_l_s-__m256-a">__m256i __lasx_xvftintrpl_l_s (__m256 a)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrpl_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrpl.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrph_l_s-__m256-a">__m256i __lasx_xvftintrph_l_s (__m256 a)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrph_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrph.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrzl_l_s-__m256-a">__m256i __lasx_xvftintrzl_l_s (__m256 a)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrzl_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrzl.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrzh_l_s-__m256-a">__m256i __lasx_xvftintrzh_l_s (__m256 a)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrzh_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrzh.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrnel_l_s-__m256-a">__m256i __lasx_xvftintrnel_l_s (__m256 a)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrnel_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrnel.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrneh_l_s-__m256-a">__m256i __lasx_xvftintrneh_l_s (__m256 a)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrneh_l_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrneh.l.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 4]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftint_l_d-__m256d-a">__m256i __lasx_xvftint_l_d (__m256d a)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftint_l_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftint.l.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftint_w_s-__m256-a">__m256i __lasx_xvftint_w_s (__m256 a)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftint_w_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftint.w.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrm_l_d-__m256d-a">__m256i __lasx_xvftintrm_l_d (__m256d a)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrm_l_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrm.l.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrm_w_s-__m256-a">__m256i __lasx_xvftintrm_w_s (__m256 a)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrm_w_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrm.w.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrp_l_d-__m256d-a">__m256i __lasx_xvftintrp_l_d (__m256d a)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrp_l_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrp.l.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrp_w_s-__m256-a">__m256i __lasx_xvftintrp_w_s (__m256 a)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrp_w_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrp.w.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrz_l_d-__m256d-a">__m256i __lasx_xvftintrz_l_d (__m256d a)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrz_l_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrz.l.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrz_w_s-__m256-a">__m256i __lasx_xvftintrz_w_s (__m256 a)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrz_w_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrz.w.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards zero.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrne_l_d-__m256d-a">__m256i __lasx_xvftintrne_l_d (__m256d a)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrne_l_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrne.l.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrne_w_s-__m256-a">__m256i __lasx_xvftintrne_w_s (__m256 a)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrne_w_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrne.w.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftint_lu_d-__m256d-a">__m256i __lasx_xvftint_lu_d (__m256d a)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftint_lu_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftint.lu.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftint_wu_s-__m256-a">__m256i __lasx_xvftint_wu_s (__m256 a)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftint_wu_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftint.wu.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrz_lu_d-__m256d-a">__m256i __lasx_xvftintrz_lu_d (__m256d a)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrz_lu_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrz.lu.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrz_wu_s-__m256-a">__m256i __lasx_xvftintrz_wu_s (__m256 a)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrz_wu_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrz.wu.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, rounding towards zero.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftint_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftint.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrm_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrm.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrp_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrp.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrz_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrz.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards zero.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvftintrne_w_d-__m256d-a-__m256d-b">__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvftintrne.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 2)
+                     ? (s64)a.fp64[i]
+                     : (s64)b.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_computation/" class="btn btn-neutral float-left" title="Floating Point Computation"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_misc/" class="btn btn-neutral float-right" title="Floating Point Misc">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_computation/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_misc/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/float_misc/index.html b/lasx/float_misc/index.html
new file mode 100644
index 00000000..c00faa6d
--- /dev/null
+++ b/lasx/float_misc/index.html
@@ -0,0 +1,775 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_misc/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Misc - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Misc";
+        var mkdocs_page_input_path = "lasx/float_misc.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/float_misc/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Misc</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfclass_d-__m256d-a">__m256i __lasx_xvfclass_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfclass_s-__m256-a">__m256i __lasx_xvfclass_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrint_s-__m256-a">__m256 __lasx_xvfrint_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrint_d-__m256d-a">__m256d __lasx_xvfrint_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrintrp_s-__m256-a">__m256 __lasx_xvfrintrp_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrintrp_d-__m256d-a">__m256d __lasx_xvfrintrp_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrintrm_s-__m256-a">__m256 __lasx_xvfrintrm_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrintrm_d-__m256d-a">__m256d __lasx_xvfrintrm_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrintrz_s-__m256-a">__m256 __lasx_xvfrintrz_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrintrz_d-__m256d-a">__m256d __lasx_xvfrintrz_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfrintrne_s-__m256-a">__m256 __lasx_xvfrintrne_s (__m256 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfrintrne_d-__m256d-a">__m256d __lasx_xvfrintrne_d (__m256d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Floating Point Misc</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-misc">Floating Point Misc</h1>
+<h2 id="__m256i-__lasx_xvfclass_d-__m256d-a">__m256i __lasx_xvfclass_d (__m256d a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfclass_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfclass.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Classify each double-precision floating point element in <code>a</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = fp_classify(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfclass_s-__m256-a">__m256i __lasx_xvfclass_s (__m256 a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfclass_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfclass.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Classify each single-precision floating point element in <code>a</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = fp_classify(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrint_s-__m256-a">__m256 __lasx_xvfrint_s (__m256 a)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrint_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrint.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, using the current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrint_d-__m256d-a">__m256d __lasx_xvfrint_d (__m256d a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrint_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrint.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, using the current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrintrp_s-__m256-a">__m256 __lasx_xvfrintrp_s (__m256 a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrintrp_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrp.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrintrp_d-__m256d-a">__m256d __lasx_xvfrintrp_d (__m256d a)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrintrp_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrp.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrintrm_s-__m256-a">__m256 __lasx_xvfrintrm_s (__m256 a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrintrm_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrm.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrintrm_d-__m256d-a">__m256d __lasx_xvfrintrm_d (__m256d a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrintrm_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrm.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrintrz_s-__m256-a">__m256 __lasx_xvfrintrz_s (__m256 a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrintrz_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrz.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrintrz_d-__m256d-a">__m256d __lasx_xvfrintrz_d (__m256d a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrintrz_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrz.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfrintrne_s-__m256-a">__m256 __lasx_xvfrintrne_s (__m256 a)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfrintrne_s (__m256 a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrne.s xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards nearest even, and store as floating point numbers.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfrintrne_d-__m256d-a">__m256d __lasx_xvfrintrne_d (__m256d a)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfrintrne_d (__m256d a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrintrne.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards nearest even, and store as floating point numbers.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_conversion/" class="btn btn-neutral float-left" title="Floating Point Conversion"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../fma/" class="btn btn-neutral float-right" title="Fused Multiply-Add">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_conversion/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../fma/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/fma/index.html b/lasx/fma/index.html
new file mode 100644
index 00000000..285180c4
--- /dev/null
+++ b/lasx/fma/index.html
@@ -0,0 +1,583 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/fma/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Fused Multiply-Add - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Fused Multiply-Add";
+        var mkdocs_page_input_path = "lasx/fma.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/fma/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Fused Multiply-Add</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmadd_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmadd_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfmsub_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfmsub_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfnmadd_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfnmadd_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvfnmsub_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvfnmsub_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Fused Multiply-Add</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="fused-multiply-add">Fused Multiply-Add</h1>
+<h2 id="__m256d-__lasx_xvfmadd_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmadd_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfmsub_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfmsub_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfnmadd_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfnmadd.d xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfnmadd_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfnmadd.s xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvfnmsub_d-__m256d-a-__m256d-b-__m256d-c">__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfnmsub.d xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute packed double precision floating point FMA (Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code>, and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvfnmsub_s-__m256-a-__m256-b-__m256-c">__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfnmsub.s xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute packed single precision floating point FMA (Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code>, and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_misc/" class="btn btn-neutral float-left" title="Floating Point Misc"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../integer_comparison/" class="btn btn-neutral float-right" title="Integer Comparison">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_misc/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../integer_comparison/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/integer_comparison/index.html b/lasx/integer_comparison/index.html
new file mode 100644
index 00000000..89463740
--- /dev/null
+++ b/lasx/integer_comparison/index.html
@@ -0,0 +1,2159 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_comparison/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Integer Comparison - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Integer Comparison";
+        var mkdocs_page_input_path = "lasx/integer_comparison.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/integer_comparison/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LASX</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Integer Comparison</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseq_b-__m256i-a-__m256i-b">__m256i __lasx_xvseq_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseq_h-__m256i-a-__m256i-b">__m256i __lasx_xvseq_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseq_w-__m256i-a-__m256i-b">__m256i __lasx_xvseq_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseq_d-__m256i-a-__m256i-b">__m256i __lasx_xvseq_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseqi_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseqi_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseqi_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvseqi_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_b-__m256i-a-__m256i-b">__m256i __lasx_xvslt_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_bu-__m256i-a-__m256i-b">__m256i __lasx_xvslt_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_h-__m256i-a-__m256i-b">__m256i __lasx_xvslt_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_hu-__m256i-a-__m256i-b">__m256i __lasx_xvslt_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_w-__m256i-a-__m256i-b">__m256i __lasx_xvslt_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_wu-__m256i-a-__m256i-b">__m256i __lasx_xvslt_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_d-__m256i-a-__m256i-b">__m256i __lasx_xvslt_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslt_du-__m256i-a-__m256i-b">__m256i __lasx_xvslt_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslti_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_b-__m256i-a-__m256i-b">__m256i __lasx_xvsle_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsle_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_h-__m256i-a-__m256i-b">__m256i __lasx_xvsle_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsle_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_w-__m256i-a-__m256i-b">__m256i __lasx_xvsle_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsle_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_d-__m256i-a-__m256i-b">__m256i __lasx_xvsle_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsle_du-__m256i-a-__m256i-b">__m256i __lasx_xvsle_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslei_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Integer Comparison</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="integer-comparison">Integer Comparison</h1>
+<h2 id="__m256i-__lasx_xvseq_b-__m256i-a-__m256i-b">__m256i __lasx_xvseq_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseq_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseq.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compare the 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseq_h-__m256i-a-__m256i-b">__m256i __lasx_xvseq_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseq_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseq.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compare the 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseq_w-__m256i-a-__m256i-b">__m256i __lasx_xvseq_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseq_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseq.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compare the 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseq_d-__m256i-a-__m256i-b">__m256i __lasx_xvseq_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseq_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseq.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compare the 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseqi_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseqi.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compare the 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseqi_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseqi.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compare the 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseqi_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseqi.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compare the 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvseqi_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvseqi.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compare the 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_b-__m256i-a-__m256i-b">__m256i __lasx_xvslt_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt; (s8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_bu-__m256i-a-__m256i-b">__m256i __lasx_xvslt_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt; (u8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_h-__m256i-a-__m256i-b">__m256i __lasx_xvslt_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt; (s16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_hu-__m256i-a-__m256i-b">__m256i __lasx_xvslt_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt; (u16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_w-__m256i-a-__m256i-b">__m256i __lasx_xvslt_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt; (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_wu-__m256i-a-__m256i-b">__m256i __lasx_xvslt_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt; (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_d-__m256i-a-__m256i-b">__m256i __lasx_xvslt_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt; (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslt_du-__m256i-a-__m256i-b">__m256i __lasx_xvslt_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslt_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslt.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt; (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt; imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt; imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt; imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt; imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslti_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslti.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_b-__m256i-a-__m256i-b">__m256i __lasx_xvsle_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt;= (s8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsle_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt;= (u8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_h-__m256i-a-__m256i-b">__m256i __lasx_xvsle_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt;= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsle_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt;= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_w-__m256i-a-__m256i-b">__m256i __lasx_xvsle_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt;= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsle_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt;= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_d-__m256i-a-__m256i-b">__m256i __lasx_xvsle_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt;= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsle_du-__m256i-a-__m256i-b">__m256i __lasx_xvsle_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsle_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsle.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt;= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt;= imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt;= imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt;= imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt;= imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt;= imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt;= imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt;= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslei_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslei.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt;= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../fma/" class="btn btn-neutral float-left" title="Fused Multiply-Add"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../integer_computation/" class="btn btn-neutral float-right" title="Integer Computation">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../fma/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../integer_computation/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/integer_computation/index.html b/lasx/integer_computation/index.html
new file mode 100644
index 00000000..c12b71cb
--- /dev/null
+++ b/lasx/integer_computation/index.html
@@ -0,0 +1,11911 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_computation/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Integer Computation - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Integer Computation";
+        var mkdocs_page_input_path = "lasx/integer_computation.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/integer_computation/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Integer Computation</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadd_b-__m256i-a-__m256i-b">__m256i __lasx_xvadd_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadd_h-__m256i-a-__m256i-b">__m256i __lasx_xvadd_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadd_w-__m256i-a-__m256i-b">__m256i __lasx_xvadd_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadd_d-__m256i-a-__m256i-b">__m256i __lasx_xvadd_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadd_q-__m256i-a-__m256i-b">__m256i __lasx_xvadd_q (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_b-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_bu-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_h-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_hu-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_w-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_wu-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_d-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvabsd_du-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadda_b-__m256i-a-__m256i-b">__m256i __lasx_xvadda_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadda_h-__m256i-a-__m256i-b">__m256i __lasx_xvadda_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadda_w-__m256i-a-__m256i-b">__m256i __lasx_xvadda_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvadda_d-__m256i-a-__m256i-b">__m256i __lasx_xvadda_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddi_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddi_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddi_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddi_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwev_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvaddwod_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_44">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_44">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_44">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_44">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_b-__m256i-a-__m256i-b">__m256i __lasx_xvavg_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_45">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_45">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_45">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_45">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_bu-__m256i-a-__m256i-b">__m256i __lasx_xvavg_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_46">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_46">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_46">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_46">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_h-__m256i-a-__m256i-b">__m256i __lasx_xvavg_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_47">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_47">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_47">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_47">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_hu-__m256i-a-__m256i-b">__m256i __lasx_xvavg_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_48">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_48">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_48">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_48">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_w-__m256i-a-__m256i-b">__m256i __lasx_xvavg_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_49">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_49">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_49">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_49">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_wu-__m256i-a-__m256i-b">__m256i __lasx_xvavg_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_50">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_50">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_50">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_50">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_d-__m256i-a-__m256i-b">__m256i __lasx_xvavg_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_51">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_51">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_51">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_51">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavg_du-__m256i-a-__m256i-b">__m256i __lasx_xvavg_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_52">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_52">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_52">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_52">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_b-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_53">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_53">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_53">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_53">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_bu-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_54">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_54">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_54">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_54">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_h-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_55">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_55">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_55">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_55">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_hu-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_56">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_56">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_56">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_56">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_w-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_57">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_57">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_57">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_57">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_wu-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_58">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_58">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_58">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_58">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_d-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_59">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_59">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_59">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_59">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvavgr_du-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_60">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_60">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_60">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_60">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_b-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_61">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_61">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_61">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_61">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_bu-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_62">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_62">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_62">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_62">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_h-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_63">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_63">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_63">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_63">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_hu-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_64">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_64">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_64">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_64">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_w-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_65">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_65">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_65">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_65">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_wu-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_66">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_66">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_66">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_66">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_d-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_67">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_67">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_67">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_67">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvdiv_du-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_68">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_68">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_68">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_68">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_69">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_69">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_69">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_69">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_hu_bu-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_70">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_70">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_70">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_70">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_71">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_71">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_71">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_71">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_wu_hu-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_72">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_72">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_72">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_72">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_73">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_73">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_73">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_73">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_du_wu-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_74">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_74">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_74">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_74">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_75">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_75">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_75">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_75">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhaddw_qu_du-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_76">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_76">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_76">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_76">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_77">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_77">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_77">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_77">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_hu_bu-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_78">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_78">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_78">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_78">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_79">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_79">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_79">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_79">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_wu_hu-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_80">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_80">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_80">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_80">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_81">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_81">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_81">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_81">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_du_wu-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_82">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_82">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_82">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_82">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_83">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_83">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_83">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_83">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvhsubw_qu_du-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_84">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_84">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_84">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_84">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmadd_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_85">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_85">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_85">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_85">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmadd_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_86">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_86">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_86">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_86">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmadd_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_87">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_87">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_87">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_87">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmadd_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_88">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_88">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_88">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_88">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_h_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_89">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_89">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_89">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_89">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_h_bu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_90">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_90">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_90">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_90">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_h_bu_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_91">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_91">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_91">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_91">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_w_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_92">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_92">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_92">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_92">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_w_hu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_93">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_93">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_93">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_93">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_w_hu_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_94">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_94">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_94">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_94">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_d_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_95">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_95">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_95">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_95">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_d_wu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_96">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_96">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_96">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_96">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_d_wu_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_97">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_97">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_97">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_97">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_q_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_98">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_98">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_98">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_98">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_q_du-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_99">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_99">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_99">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_99">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwev_q_du_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_100">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_100">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_100">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_100">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_h_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_101">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_101">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_101">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_101">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_h_bu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_102">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_102">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_102">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_102">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_h_bu_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_103">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_103">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_103">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_103">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_w_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_104">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_104">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_104">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_104">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_w_hu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_105">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_105">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_105">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_105">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_w_hu_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_106">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_106">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_106">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_106">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_d_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_107">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_107">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_107">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_107">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_d_wu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_108">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_108">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_108">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_108">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_d_wu_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_109">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_109">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_109">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_109">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_q_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_110">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_110">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_110">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_110">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_q_du-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_111">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_111">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_111">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_111">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaddwod_q_du_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_112">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_112">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_112">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_112">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_b-__m256i-a-__m256i-b">__m256i __lasx_xvmax_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_113">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_113">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_113">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_113">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmax_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_114">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_114">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_114">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_114">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_h-__m256i-a-__m256i-b">__m256i __lasx_xvmax_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_115">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_115">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_115">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_115">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmax_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_116">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_116">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_116">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_116">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_w-__m256i-a-__m256i-b">__m256i __lasx_xvmax_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_117">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_117">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_117">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_117">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmax_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_118">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_118">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_118">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_118">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_d-__m256i-a-__m256i-b">__m256i __lasx_xvmax_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_119">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_119">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_119">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_119">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmax_du-__m256i-a-__m256i-b">__m256i __lasx_xvmax_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_120">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_120">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_120">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_120">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_121">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_121">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_121">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_121">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_122">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_122">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_122">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_122">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_123">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_123">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_123">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_123">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_124">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_124">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_124">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_124">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_125">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_125">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_125">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_125">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_126">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_126">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_126">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_126">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_127">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_127">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_127">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_127">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmaxi_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_128">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_128">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_128">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_128">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_b-__m256i-a-__m256i-b">__m256i __lasx_xvmin_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_129">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_129">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_129">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_129">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmin_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_130">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_130">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_130">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_130">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_h-__m256i-a-__m256i-b">__m256i __lasx_xvmin_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_131">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_131">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_131">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_131">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmin_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_132">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_132">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_132">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_132">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_w-__m256i-a-__m256i-b">__m256i __lasx_xvmin_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_133">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_133">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_133">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_133">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmin_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_134">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_134">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_134">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_134">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_d-__m256i-a-__m256i-b">__m256i __lasx_xvmin_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_135">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_135">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_135">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_135">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmin_du-__m256i-a-__m256i-b">__m256i __lasx_xvmin_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_136">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_136">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_136">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_136">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_137">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_137">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_137">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_137">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_138">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_138">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_138">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_138">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_139">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_139">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_139">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_139">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_140">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_140">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_140">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_140">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_141">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_141">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_141">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_141">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_142">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_142">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_142">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_142">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_143">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_143">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_143">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_143">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmini_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_144">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_144">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_144">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_144">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_b-__m256i-a-__m256i-b">__m256i __lasx_xvmod_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_145">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_145">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_145">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_145">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmod_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_146">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_146">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_146">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_146">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_h-__m256i-a-__m256i-b">__m256i __lasx_xvmod_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_147">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_147">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_147">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_147">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmod_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_148">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_148">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_148">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_148">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_w-__m256i-a-__m256i-b">__m256i __lasx_xvmod_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_149">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_149">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_149">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_149">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmod_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_150">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_150">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_150">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_150">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_d-__m256i-a-__m256i-b">__m256i __lasx_xvmod_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_151">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_151">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_151">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_151">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmod_du-__m256i-a-__m256i-b">__m256i __lasx_xvmod_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_152">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_152">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_152">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_152">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmsub_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_153">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_153">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_153">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_153">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmsub_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_154">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_154">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_154">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_154">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmsub_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_155">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_155">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_155">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_155">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmsub_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_156">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_156">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_156">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_156">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_b-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_157">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_157">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_157">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_157">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_158">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_158">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_158">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_158">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_h-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_159">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_159">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_159">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_159">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_160">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_160">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_160">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_160">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_w-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_161">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_161">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_161">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_161">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_162">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_162">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_162">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_162">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_d-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_163">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_163">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_163">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_163">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmuh_du-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_164">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_164">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_164">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_164">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmul_b-__m256i-a-__m256i-b">__m256i __lasx_xvmul_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_165">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_165">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_165">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_165">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmul_h-__m256i-a-__m256i-b">__m256i __lasx_xvmul_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_166">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_166">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_166">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_166">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmul_w-__m256i-a-__m256i-b">__m256i __lasx_xvmul_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_167">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_167">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_167">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_167">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmul_d-__m256i-a-__m256i-b">__m256i __lasx_xvmul_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_168">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_168">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_168">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_168">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_169">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_169">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_169">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_169">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_170">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_170">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_170">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_170">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_171">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_171">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_171">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_171">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_172">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_172">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_172">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_172">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_173">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_173">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_173">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_173">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_174">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_174">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_174">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_174">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_175">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_175">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_175">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_175">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_176">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_176">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_176">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_176">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_177">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_177">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_177">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_177">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_178">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_178">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_178">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_178">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_179">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_179">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_179">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_179">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwev_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_180">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_180">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_180">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_180">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_181">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_181">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_181">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_181">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_182">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_182">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_182">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_182">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_183">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_183">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_183">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_183">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_184">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_184">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_184">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_184">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_185">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_185">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_185">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_185">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_186">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_186">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_186">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_186">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_187">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_187">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_187">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_187">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_188">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_188">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_188">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_188">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_189">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_189">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_189">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_189">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_190">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_190">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_190">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_190">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_191">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_191">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_191">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_191">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmulwod_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_192">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_192">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_192">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_192">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvneg_b-__m256i-a">__m256i __lasx_xvneg_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_193">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_193">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_193">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_193">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvneg_h-__m256i-a">__m256i __lasx_xvneg_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_194">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_194">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_194">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_194">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvneg_w-__m256i-a">__m256i __lasx_xvneg_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_195">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_195">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_195">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_195">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvneg_d-__m256i-a">__m256i __lasx_xvneg_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_196">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_196">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_196">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_196">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_b-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_197">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_197">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_197">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_197">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_198">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_198">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_198">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_198">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_h-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_199">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_199">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_199">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_199">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_200">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_200">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_200">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_200">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_w-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_201">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_201">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_201">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_201">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_202">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_202">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_202">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_202">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_d-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_203">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_203">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_203">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_203">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsadd_du-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_204">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_204">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_204">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_204">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_b-__m256i-a-__m256i-b">__m256i __lasx_xvssub_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_205">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_205">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_205">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_205">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_bu-__m256i-a-__m256i-b">__m256i __lasx_xvssub_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_206">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_206">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_206">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_206">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_h-__m256i-a-__m256i-b">__m256i __lasx_xvssub_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_207">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_207">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_207">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_207">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_hu-__m256i-a-__m256i-b">__m256i __lasx_xvssub_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_208">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_208">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_208">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_208">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_w-__m256i-a-__m256i-b">__m256i __lasx_xvssub_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_209">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_209">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_209">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_209">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_wu-__m256i-a-__m256i-b">__m256i __lasx_xvssub_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_210">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_210">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_210">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_210">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_d-__m256i-a-__m256i-b">__m256i __lasx_xvssub_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_211">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_211">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_211">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_211">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssub_du-__m256i-a-__m256i-b">__m256i __lasx_xvssub_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_212">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_212">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_212">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_212">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsub_b-__m256i-a-__m256i-b">__m256i __lasx_xvsub_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_213">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_213">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_213">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_213">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsub_h-__m256i-a-__m256i-b">__m256i __lasx_xvsub_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_214">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_214">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_214">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_214">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsub_w-__m256i-a-__m256i-b">__m256i __lasx_xvsub_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_215">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_215">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_215">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_215">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsub_d-__m256i-a-__m256i-b">__m256i __lasx_xvsub_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_216">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_216">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_216">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_216">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsub_q-__m256i-a-__m256i-b">__m256i __lasx_xvsub_q (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_217">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_217">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_217">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_217">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubi_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_218">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_218">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_218">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_218">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubi_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_219">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_219">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_219">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_219">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubi_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_220">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_220">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_220">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_220">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubi_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_221">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_221">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_221">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_221">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_222">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_222">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_222">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_222">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_223">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_223">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_223">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_223">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_224">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_224">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_224">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_224">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_225">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_225">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_225">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_225">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_226">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_226">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_226">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_226">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_227">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_227">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_227">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_227">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_228">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_228">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_228">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_228">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwev_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_229">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_229">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_229">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_229">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_230">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_230">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_230">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_230">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_231">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_231">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_231">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_231">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_232">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_232">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_232">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_232">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_233">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_233">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_233">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_233">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_234">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_234">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_234">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_234">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_235">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_235">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_235">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_235">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_236">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_236">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_236">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_236">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsubwod_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_237">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_237">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_237">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_237">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Integer Computation</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="integer-computation">Integer Computation</h1>
+<h2 id="__m256i-__lasx_xvadd_b-__m256i-a-__m256i-b">__m256i __lasx_xvadd_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadd_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadd.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Add 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] + b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadd_h-__m256i-a-__m256i-b">__m256i __lasx_xvadd_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadd_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadd.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Add 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] + b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadd_w-__m256i-a-__m256i-b">__m256i __lasx_xvadd_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadd_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadd.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Add 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] + b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadd_d-__m256i-a-__m256i-b">__m256i __lasx_xvadd_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadd_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadd.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Add 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] + b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadd_q-__m256i-a-__m256i-b">__m256i __lasx_xvadd_q (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadd_q (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadd.q xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Add 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = a.qword[i] + b.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_b-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute absolute difference of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &gt; (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_bu-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute absolute difference of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &gt; (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_h-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute absolute difference of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &gt; (s16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_hu-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compute absolute difference of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &gt; (u16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_w-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compute absolute difference of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &gt; (s32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_wu-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compute absolute difference of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &gt; (u32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_d-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compute absolute difference of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &gt; (s64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvabsd_du-__m256i-a-__m256i-b">__m256i __lasx_xvabsd_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvabsd_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvabsd.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Compute absolute difference of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &gt; (u64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadda_b-__m256i-a-__m256i-b">__m256i __lasx_xvadda_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadda_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadda.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Add the absolute values of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadda_h-__m256i-a-__m256i-b">__m256i __lasx_xvadda_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadda_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadda.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Add the absolute values of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadda_w-__m256i-a-__m256i-b">__m256i __lasx_xvadda_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadda_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadda.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Add the absolute values of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvadda_d-__m256i-a-__m256i-b">__m256i __lasx_xvadda_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvadda_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvadda.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Add the absolute values of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddi_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddi.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Add the immediate <code>imm</code> to each 8-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddi_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddi.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Add the immediate <code>imm</code> to each 16-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddi_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddi.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Add the immediate <code>imm</code> to each 32-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddi_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddi.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Add the immediate <code>imm</code> to each 64-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Add even-positioned signed 8-bit elements in <code>a</code> and even-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and even-positioned unsigned 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and even-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Add even-positioned signed 16-bit elements in <code>a</code> and even-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and even-positioned unsigned 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and even-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Add even-positioned signed 32-bit elements in <code>a</code> and even-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and even-positioned unsigned 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and even-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Add even-positioned signed 64-bit elements in <code>a</code> and even-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and even-positioned unsigned 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwev_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and even-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Add odd-positioned signed 8-bit elements in <code>a</code> and odd-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and odd-positioned unsigned 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and odd-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Add odd-positioned signed 16-bit elements in <code>a</code> and odd-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and odd-positioned unsigned 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and odd-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Add odd-positioned signed 32-bit elements in <code>a</code> and odd-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> and odd-positioned unsigned 32-bit elements in <code>b</code>, and save the 64-bit results in <code>dst</code>.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> and odd-positioned signed 32-bit elements in <code>b</code>, and save the 64-bit results in <code>dst</code>.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>Add odd-positioned signed 64-bit elements in <code>a</code> and odd-positioned signed 64-bit elements in <code>b</code>, and save the 128-bit results in <code>dst</code>.</p>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and odd-positioned unsigned 64-bit elements in <code>b</code>, and save the 128-bit results in <code>dst</code>.</p>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvaddwod_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_44">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_44">Description</h3>
+<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and odd-positioned signed 64-bit elements in <code>b</code>, and save the 128-bit results in <code>dst</code>.</p>
+<h3 id="operation_44">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_44">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_b-__m256i-a-__m256i-b">__m256i __lasx_xvavg_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_45">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_45">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_45">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] &amp; b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_45">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_bu-__m256i-a-__m256i-b">__m256i __lasx_xvavg_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_46">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_46">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_46">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] &amp; b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_46">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_h-__m256i-a-__m256i-b">__m256i __lasx_xvavg_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_47">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_47">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_47">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] &amp; b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_47">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_hu-__m256i-a-__m256i-b">__m256i __lasx_xvavg_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_48">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_48">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_48">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] &amp; b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_48">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_w-__m256i-a-__m256i-b">__m256i __lasx_xvavg_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_49">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_49">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_49">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] &amp; b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_49">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_wu-__m256i-a-__m256i-b">__m256i __lasx_xvavg_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_50">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_50">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_50">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] &amp; b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_50">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_d-__m256i-a-__m256i-b">__m256i __lasx_xvavg_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_51">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_51">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_51">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_51">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavg_du-__m256i-a-__m256i-b">__m256i __lasx_xvavg_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_52">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavg_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavg.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_52">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_52">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_52">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_b-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_53">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_53">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_53">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] | b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_53">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_bu-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_54">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_54">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_54">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] | b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_54">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_h-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_55">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_55">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_55">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] | b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_55">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_hu-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_56">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_56">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_56">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] | b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_56">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_w-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_57">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_57">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_57">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] | b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_57">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_wu-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_58">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_58">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_58">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] | b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_58">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_d-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_59">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_59">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_59">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] | b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_59">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvavgr_du-__m256i-a-__m256i-b">__m256i __lasx_xvavgr_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_60">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvavgr_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvavgr.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_60">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_60">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] | b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_60">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_b-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_61">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_61">Description</h3>
+<p>Divide signed 8-bit elements in <code>a</code> by signed 8-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_61">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_61">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 32</td>
+<td>0.06(1/15.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>32, 36</td>
+<td>0.05(1/20.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_bu-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_62">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_62">Description</h3>
+<p>Divide unsigned 8-bit elements in <code>a</code> by unsigned 8-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_62">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_62">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 33</td>
+<td>0.06(1/16.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 36</td>
+<td>0.05(1/20.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_h-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_63">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_63">Description</h3>
+<p>Divide signed 16-bit elements in <code>a</code> by signed 16-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_63">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_63">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17</td>
+<td>0.12(1/8.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>21.5, 22</td>
+<td>0.08(1/13)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_hu-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_64">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_64">Description</h3>
+<p>Divide unsigned 16-bit elements in <code>a</code> by unsigned 16-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_64">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_64">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17, 22</td>
+<td>0.11(1/9)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 21.5</td>
+<td>0.07(1/15)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_w-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_65">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_65">Description</h3>
+<p>Divide signed 32-bit elements in <code>a</code> by signed 32-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_65">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_65">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 17.5</td>
+<td>0.09(1/11.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_wu-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_66">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_66">Description</h3>
+<p>Divide unsigned 32-bit elements in <code>a</code> by unsigned 32-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_66">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_66">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 17.5</td>
+<td>0.07(1/15)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_d-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_67">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_67">Description</h3>
+<p>Divide signed 64-bit elements in <code>a</code> by signed 64-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_67">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_67">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 18.5</td>
+<td>0.11(1/9)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvdiv_du-__m256i-a-__m256i-b">__m256i __lasx_xvdiv_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_68">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvdiv_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvdiv.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_68">Description</h3>
+<p>Divide unsigned 64-bit elements in <code>a</code> by unsigned 64-bit elements in <code>b</code>, and save the quotients in <code>dst</code>.</p>
+<h3 id="operation_68">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_68">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 18.5</td>
+<td>0.11(1/9)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_69">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_69">Description</h3>
+<p>Add odd-positioned signed 8-bit elements in <code>a</code> to even-positioned signed 8-bit elements in <code>b</code>, and save the 16-bit results in <code>dst</code>.</p>
+<h3 id="operation_69">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_69">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_hu_bu-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_70">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.hu.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_70">Description</h3>
+<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> to even-positioned unsigned 8-bit elements in <code>b</code> to get a 16-bit result.</p>
+<h3 id="operation_70">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_70">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_71">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_71">Description</h3>
+<p>Add odd-positioned signed 16-bit elements in <code>a</code> to even-positioned signed 16-bit elements in <code>b</code> to get a 32-bit result.</p>
+<h3 id="operation_71">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_71">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_wu_hu-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_72">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.wu.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_72">Description</h3>
+<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> to even-positioned unsigned 16-bit elements in <code>b</code> to get a 32-bit result.</p>
+<h3 id="operation_72">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_72">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_73">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_73">Description</h3>
+<p>Add odd-positioned signed 32-bit elements in <code>a</code> to even-positioned signed 32-bit elements in <code>b</code> to get a 64-bit result.</p>
+<h3 id="operation_73">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_73">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_du_wu-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_74">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.du.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_74">Description</h3>
+<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> to even-positioned unsigned 32-bit elements in <code>b</code> to get a 64-bit result.</p>
+<h3 id="operation_74">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_74">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_75">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_75">Description</h3>
+<p>Add odd-positioned signed 64-bit elements in <code>a</code> to even-positioned signed 64-bit elements in <code>b</code> to get a 128-bit result.</p>
+<h3 id="operation_75">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_75">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhaddw_qu_du-__m256i-a-__m256i-b">__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_76">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhaddw.qu.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_76">Description</h3>
+<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> to even-positioned unsigned 64-bit elements in <code>b</code> to get a 128-bit result.</p>
+<h3 id="operation_76">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_76">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_77">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_77">Description</h3>
+<p>Subtract even-positioned signed 8-bit elements in <code>b</code> from odd-positioned signed 8-bit elements in <code>a</code> to get a 16-bit result.</p>
+<h3 id="operation_77">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_77">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_hu_bu-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_78">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.hu.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_78">Description</h3>
+<p>Subtract even-positioned unsigned 8-bit elements in <code>b</code> from odd-positioned unsigned 8-bit elements in <code>a</code> to get a 16-bit result.</p>
+<h3 id="operation_78">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_78">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_79">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_79">Description</h3>
+<p>Subtract even-positioned signed 16-bit elements in <code>b</code> from odd-positioned signed 16-bit elements in <code>a</code> to get a 32-bit result.</p>
+<h3 id="operation_79">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_79">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_wu_hu-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_80">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.wu.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_80">Description</h3>
+<p>Subtract even-positioned unsigned 16-bit elements in <code>b</code> from odd-positioned unsigned 16-bit elements in <code>a</code> to get a 32-bit result.</p>
+<h3 id="operation_80">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_80">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_81">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_81">Description</h3>
+<p>Subtract even-positioned signed 32-bit elements in <code>b</code> from odd-positioned signed 32-bit elements in <code>a</code> to get a 64-bit result.</p>
+<h3 id="operation_81">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_81">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_du_wu-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_82">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.du.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_82">Description</h3>
+<p>Subtract even-positioned unsigned 32-bit elements in <code>b</code> from odd-positioned unsigned 32-bit elements in <code>a</code> to get a 64-bit result.</p>
+<h3 id="operation_82">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_82">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_83">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_83">Description</h3>
+<p>Subtract even-positioned signed 64-bit elements in <code>b</code> from odd-positioned signed 64-bit elements in <code>a</code> to get a 128-bit result.</p>
+<h3 id="operation_83">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_83">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvhsubw_qu_du-__m256i-a-__m256i-b">__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_84">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvhsubw.qu.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_84">Description</h3>
+<p>Subtract even-positioned unsigned 64-bit elements in <code>b</code> from odd-positioned unsigned 64-bit elements in <code>a</code> to get a 128-bit result.</p>
+<h3 id="operation_84">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_84">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmadd_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_85">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmadd.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_85">Description</h3>
+<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_85">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_85">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmadd_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_86">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmadd.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_86">Description</h3>
+<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_86">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_86">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmadd_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_87">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmadd.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_87">Description</h3>
+<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_87">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_87">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmadd_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_88">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmadd.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_88">Description</h3>
+<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_88">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_88">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_h_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_89">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_89">Description</h3>
+<p>Multiply even-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_89">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_89">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_h_bu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_90">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_90">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_90">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_90">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_h_bu_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_91">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_91">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_91">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_91">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_w_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_92">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_92">Description</h3>
+<p>Multiply even-positioned signed 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_92">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] =
+      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_92">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_w_hu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_93">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_93">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_93">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_93">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_w_hu_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_94">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_94">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_94">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_94">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_d_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_95">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_95">Description</h3>
+<p>Multiply even-positioned signed 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_95">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] =
+      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_95">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_d_wu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_96">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_96">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_96">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_96">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_d_wu_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_97">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_97">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_97">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_97">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_q_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_98">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_98">Description</h3>
+<p>Multiply even-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>
+<h3 id="operation_98">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] =
+      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_98">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_q_du-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_99">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_99">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> by the corresponding unsigned 64-bit elements in <code>c</code>, then add the products to the 128-bit elements in <code>a</code>.</p>
+<h3 id="operation_99">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_99">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwev_q_du_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_100">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_100">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> by the corresponding signed 64-bit elements in <code>c</code>, then add the products to the 128-bit elements in <code>a</code>.</p>
+<h3 id="operation_100">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_100">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_h_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_101">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_101">Description</h3>
+<p>Multiply odd-positioned signed 8-bit elements in <code>b</code> by the corresponding signed 8-bit elements in <code>c</code>, then add the products to the 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_101">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_101">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_h_bu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_102">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_102">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> by the corresponding unsigned 8-bit elements in <code>c</code>, then add the products to the 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_102">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_102">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_h_bu_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_103">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_103">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> by the corresponding signed 8-bit elements in <code>c</code>, then add the products to the 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_103">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_103">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_w_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_104">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_104">Description</h3>
+<p>Multiply odd-positioned signed 16-bit elements in <code>b</code> by the corresponding signed 16-bit elements in <code>c</code>, then add the products to the 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_104">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_104">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_w_hu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_105">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_105">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> by the corresponding unsigned 16-bit elements in <code>c</code>, then add the products to the 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_105">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+                (u32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_105">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_w_hu_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_106">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_106">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> by the corresponding signed 16-bit elements in <code>c</code>, then add the products to the 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_106">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_106">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_d_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_107">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_107">Description</h3>
+<p>Multiply odd-positioned signed 32-bit elements in <code>b</code> by the corresponding signed 32-bit elements in <code>c</code>, then add the products to the 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_107">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_107">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_d_wu-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_108">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_108">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> by the corresponding unsigned 32-bit elements in <code>c</code>, then add the products to the 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_108">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+                 (u64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_108">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_d_wu_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_109">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_109">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> by the corresponding signed 32-bit elements in <code>c</code>, then add the products to the 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_109">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_109">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_q_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_110">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_110">Description</h3>
+<p>Multiply odd-positioned signed 64-bit elements in <code>b</code> by the corresponding signed 64-bit elements in <code>c</code>, then add the products to the 128-bit elements in <code>a</code>.</p>
+<h3 id="operation_110">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_110">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_q_du-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_111">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_111">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> by the corresponding unsigned 64-bit elements in <code>c</code>, then add the products to the 128-bit elements in <code>a</code>.</p>
+<h3 id="operation_111">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+                 (u128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_111">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaddwod_q_du_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_112">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaddwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_112">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> by the corresponding signed 64-bit elements in <code>c</code>, then add the products to the 128-bit elements in <code>a</code>.</p>
+<h3 id="operation_112">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_112">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_b-__m256i-a-__m256i-b">__m256i __lasx_xvmax_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_113">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_113">Description</h3>
+<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_113">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_113">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmax_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_114">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_114">Description</h3>
+<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_114">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_114">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_h-__m256i-a-__m256i-b">__m256i __lasx_xvmax_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_115">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_115">Description</h3>
+<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_115">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_115">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmax_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_116">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_116">Description</h3>
+<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_116">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_116">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_w-__m256i-a-__m256i-b">__m256i __lasx_xvmax_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_117">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_117">Description</h3>
+<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_117">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_117">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmax_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_118">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_118">Description</h3>
+<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_118">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_118">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_d-__m256i-a-__m256i-b">__m256i __lasx_xvmax_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_119">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_119">Description</h3>
+<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_119">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_119">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmax_du-__m256i-a-__m256i-b">__m256i __lasx_xvmax_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_120">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmax_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmax.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_120">Description</h3>
+<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_120">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_120">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_121">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_121">Description</h3>
+<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_121">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_121">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_122">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_122">Description</h3>
+<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_122">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_122">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_123">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_123">Description</h3>
+<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_123">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_123">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_124">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_124">Description</h3>
+<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_124">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_124">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_125">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_125">Description</h3>
+<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_125">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_125">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_126">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_126">Description</h3>
+<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_126">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_126">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_127">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_127">Description</h3>
+<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_127">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_127">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmaxi_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_128">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmaxi.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_128">Description</h3>
+<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_128">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_128">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_b-__m256i-a-__m256i-b">__m256i __lasx_xvmin_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_129">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_129">Description</h3>
+<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_129">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_129">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmin_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_130">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_130">Description</h3>
+<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_130">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_130">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_h-__m256i-a-__m256i-b">__m256i __lasx_xvmin_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_131">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_131">Description</h3>
+<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_131">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_131">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmin_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_132">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_132">Description</h3>
+<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_132">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_132">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_w-__m256i-a-__m256i-b">__m256i __lasx_xvmin_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_133">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_133">Description</h3>
+<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_133">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_133">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmin_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_134">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_134">Description</h3>
+<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_134">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_134">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_d-__m256i-a-__m256i-b">__m256i __lasx_xvmin_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_135">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_135">Description</h3>
+<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_135">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_135">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmin_du-__m256i-a-__m256i-b">__m256i __lasx_xvmin_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_136">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmin_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmin.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_136">Description</h3>
+<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_136">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_136">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_b-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_137">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_137">Description</h3>
+<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_137">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_137">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_138">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_138">Description</h3>
+<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_138">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_138">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_h-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_139">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_139">Description</h3>
+<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_139">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_139">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_140">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_140">Description</h3>
+<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_140">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_140">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_w-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_141">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_141">Description</h3>
+<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_141">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_141">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_142">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_142">Description</h3>
+<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_142">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_142">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_d-__m256i-a-imm_n16_15-imm">__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_143">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_143">Description</h3>
+<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_143">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_143">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmini_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_144">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmini.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_144">Description</h3>
+<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_144">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_144">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_b-__m256i-a-__m256i-b">__m256i __lasx_xvmod_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_145">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_145">Description</h3>
+<p>Compute the remainder of dividing signed 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_145">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_145">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 41</td>
+<td>0.06(1/15.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 33</td>
+<td>0.05(1/21.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmod_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_146">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_146">Description</h3>
+<p>Compute the remainder of dividing unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_146">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_146">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 37</td>
+<td>0.06(1/17.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 37</td>
+<td>0.05(1/22)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_h-__m256i-a-__m256i-b">__m256i __lasx_xvmod_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_147">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_147">Description</h3>
+<p>Compute the remainder of dividing signed 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_147">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_147">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17, 21</td>
+<td>0.12(1/8.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 21</td>
+<td>0.07(1/13.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmod_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_148">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_148">Description</h3>
+<p>Compute the remainder of dividing unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_148">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_148">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17, 25</td>
+<td>0.11(1/9.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 23</td>
+<td>0.06(1/16)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_w-__m256i-a-__m256i-b">__m256i __lasx_xvmod_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_149">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_149">Description</h3>
+<p>Compute the remainder of dividing signed 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_149">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_149">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11, 13</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 15</td>
+<td>0.07(1/13.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmod_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_150">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_150">Description</h3>
+<p>Compute the remainder of dividing unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_150">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_150">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11, 13</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 15</td>
+<td>0.06(1/16)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_d-__m256i-a-__m256i-b">__m256i __lasx_xvmod_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_151">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_151">Description</h3>
+<p>Compute the remainder of dividing signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_151">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_151">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8, 10</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 10</td>
+<td>0.11(1/9.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmod_du-__m256i-a-__m256i-b">__m256i __lasx_xvmod_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_152">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmod_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmod.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_152">Description</h3>
+<p>Compute the remainder of dividing unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_152">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_152">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8, 10</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 10</td>
+<td>0.11(1/9.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmsub_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_153">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmsub.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_153">Description</h3>
+<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_153">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_153">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmsub_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_154">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmsub.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_154">Description</h3>
+<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_154">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_154">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmsub_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_155">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmsub.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_155">Description</h3>
+<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_155">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_155">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmsub_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_156">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmsub.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_156">Description</h3>
+<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_156">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_156">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_b-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_157">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_157">Description</h3>
+<p>Multiply signed 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>
+<h3 id="operation_157">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) &gt;&gt; 8;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_157">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_158">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_158">Description</h3>
+<p>Multiply unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>
+<h3 id="operation_158">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) &gt;&gt; 8;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_158">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_h-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_159">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_159">Description</h3>
+<p>Multiply signed 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_159">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) &gt;&gt; 16;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_159">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_160">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_160">Description</h3>
+<p>Multiply unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_160">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) &gt;&gt; 16;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_160">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_w-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_161">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_161">Description</h3>
+<p>Multiply signed 32-bit elements in <code>a</code> and <code>b</code>, save the high 32 bits of the 64-bit product in <code>dst</code>.</p>
+<h3 id="operation_161">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) &gt;&gt; 32;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_161">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_162">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_162">Description</h3>
+<p>Multiply unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the high 32 bits of the 64-bit product in <code>dst</code>.</p>
+<h3 id="operation_162">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) &gt;&gt; 32;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_162">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_d-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_163">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_163">Description</h3>
+<p>Multiply signed 64-bit elements in <code>a</code> and <code>b</code>, save the high 64 bits of the 128-bit product in <code>dst</code>.</p>
+<h3 id="operation_163">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) &gt;&gt; 64;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_163">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmuh_du-__m256i-a-__m256i-b">__m256i __lasx_xvmuh_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_164">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmuh_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmuh.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_164">Description</h3>
+<p>Multiply unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the high 64 bits of the 128-bit product in <code>dst</code>.</p>
+<h3 id="operation_164">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) &gt;&gt; 64;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_164">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmul_b-__m256i-a-__m256i-b">__m256i __lasx_xvmul_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_165">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmul_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmul.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_165">Description</h3>
+<p>Multiply 8-bit elements in <code>a</code> and <code>b</code>, save the low 8 bits of the result in <code>dst</code>.</p>
+<h3 id="operation_165">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] * b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_165">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmul_h-__m256i-a-__m256i-b">__m256i __lasx_xvmul_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_166">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmul_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmul.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_166">Description</h3>
+<p>Multiply 16-bit elements in <code>a</code> and <code>b</code>, save the low 16 bits of the result in <code>dst</code>.</p>
+<h3 id="operation_166">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] * b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_166">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmul_w-__m256i-a-__m256i-b">__m256i __lasx_xvmul_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_167">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmul_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmul.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_167">Description</h3>
+<p>Multiply 32-bit elements in <code>a</code> and <code>b</code>, save the low 32 bits of the result in <code>dst</code>.</p>
+<h3 id="operation_167">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] * b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_167">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmul_d-__m256i-a-__m256i-b">__m256i __lasx_xvmul_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_168">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmul_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmul.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_168">Description</h3>
+<p>Multiply 64-bit elements in <code>a</code> and <code>b</code>, save the low 64 bits of the result in <code>dst</code>.</p>
+<h3 id="operation_168">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] * b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_168">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_169">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_169">Description</h3>
+<p>Multiply even-positioned signed 8-bit elements in <code>a</code> by the even-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_169">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_169">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_170">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_170">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> by the even-positioned unsigned 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_170">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_170">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_171">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.h.bu.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_171">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> by the even-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_171">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_171">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_172">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_172">Description</h3>
+<p>Multiply even-positioned signed 16-bit elements in <code>a</code> by the even-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_172">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_172">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_173">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_173">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> by the even-positioned unsigned 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_173">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_173">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_174">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.w.hu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_174">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> by the even-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_174">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_174">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_175">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_175">Description</h3>
+<p>Multiply even-positioned signed 32-bit elements in <code>a</code> by the even-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_175">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_175">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_176">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_176">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> by the even-positioned unsigned 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_176">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_176">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_177">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.d.wu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_177">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> by the even-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_177">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_177">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_178">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_178">Description</h3>
+<p>Multiply even-positioned signed 64-bit elements in <code>a</code> by the even-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_178">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_178">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_179">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_179">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> by the even-positioned unsigned 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_179">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_179">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwev_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_180">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwev.q.du.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_180">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> by the even-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_180">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_180">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_181">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_181">Description</h3>
+<p>Multiply odd-positioned signed 8-bit elements in <code>a</code> by the odd-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_181">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_181">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_182">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_182">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> by the odd-positioned unsigned 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_182">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_182">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_h_bu_b-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_183">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.h.bu.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_183">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> by the odd-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_183">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_183">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_184">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_184">Description</h3>
+<p>Multiply odd-positioned signed 16-bit elements in <code>a</code> by the odd-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_184">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_184">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_185">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_185">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> by the odd-positioned unsigned 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_185">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_185">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_w_hu_h-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_186">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.w.hu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_186">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> by the odd-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_186">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_186">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_187">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_187">Description</h3>
+<p>Multiply odd-positioned signed 32-bit elements in <code>a</code> by the odd-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_187">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_187">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_188">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_188">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> by the odd-positioned unsigned 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_188">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_188">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_d_wu_w-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_189">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.d.wu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_189">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> by the odd-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_189">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_189">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_190">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_190">Description</h3>
+<p>Multiply odd-positioned signed 64-bit elements in <code>a</code> by the odd-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_190">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_190">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_191">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_191">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> by the odd-positioned unsigned 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_191">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_191">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmulwod_q_du_d-__m256i-a-__m256i-b">__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_192">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmulwod.q.du.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_192">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_192">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_192">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvneg_b-__m256i-a">__m256i __lasx_xvneg_b (__m256i a)</h2>
+<h3 id="synopsis_193">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvneg_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvneg.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_193">Description</h3>
+<p>Negate 8-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_193">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = -a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_193">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvneg_h-__m256i-a">__m256i __lasx_xvneg_h (__m256i a)</h2>
+<h3 id="synopsis_194">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvneg_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvneg.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_194">Description</h3>
+<p>Negate 16-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_194">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = -a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_194">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvneg_w-__m256i-a">__m256i __lasx_xvneg_w (__m256i a)</h2>
+<h3 id="synopsis_195">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvneg_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvneg.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_195">Description</h3>
+<p>Negate 32-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_195">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = -a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_195">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvneg_d-__m256i-a">__m256i __lasx_xvneg_d (__m256i a)</h2>
+<h3 id="synopsis_196">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvneg_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvneg.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_196">Description</h3>
+<p>Negate 64-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_196">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = -a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_196">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_b-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_197">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_197">Description</h3>
+<p>Saturating add the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_197">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_197">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_198">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_198">Description</h3>
+<p>Saturating add the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_198">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_198">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_h-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_199">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_199">Description</h3>
+<p>Saturating add the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_199">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_199">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_200">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_200">Description</h3>
+<p>Saturating add the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_200">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_200">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_w-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_201">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_201">Description</h3>
+<p>Saturating add the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_201">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_201">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_202">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_202">Description</h3>
+<p>Saturating add the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_202">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_202">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_d-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_203">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_203">Description</h3>
+<p>Saturating add the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_203">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_203">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsadd_du-__m256i-a-__m256i-b">__m256i __lasx_xvsadd_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_204">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsadd_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsadd.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_204">Description</h3>
+<p>Saturating add the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_204">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_204">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_b-__m256i-a-__m256i-b">__m256i __lasx_xvssub_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_205">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_205">Description</h3>
+<p>Saturating subtract the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_205">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_205">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_bu-__m256i-a-__m256i-b">__m256i __lasx_xvssub_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_206">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_206">Description</h3>
+<p>Saturating subtract the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_206">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_206">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_h-__m256i-a-__m256i-b">__m256i __lasx_xvssub_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_207">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_207">Description</h3>
+<p>Saturating subtract the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_207">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_207">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_hu-__m256i-a-__m256i-b">__m256i __lasx_xvssub_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_208">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_208">Description</h3>
+<p>Saturating subtract the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_208">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_208">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_w-__m256i-a-__m256i-b">__m256i __lasx_xvssub_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_209">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_209">Description</h3>
+<p>Saturating subtract the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_209">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_209">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_wu-__m256i-a-__m256i-b">__m256i __lasx_xvssub_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_210">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_210">Description</h3>
+<p>Saturating subtract the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_210">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_210">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_d-__m256i-a-__m256i-b">__m256i __lasx_xvssub_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_211">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_211">Description</h3>
+<p>Saturating subtract the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_211">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_211">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssub_du-__m256i-a-__m256i-b">__m256i __lasx_xvssub_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_212">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssub_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssub.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_212">Description</h3>
+<p>Saturating subtract the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_212">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_212">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsub_b-__m256i-a-__m256i-b">__m256i __lasx_xvsub_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_213">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsub_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsub.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_213">Description</h3>
+<p>Subtract 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_213">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_213">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsub_h-__m256i-a-__m256i-b">__m256i __lasx_xvsub_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_214">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsub_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsub.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_214">Description</h3>
+<p>Subtract 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_214">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_214">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsub_w-__m256i-a-__m256i-b">__m256i __lasx_xvsub_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_215">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsub_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsub.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_215">Description</h3>
+<p>Subtract 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_215">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_215">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsub_d-__m256i-a-__m256i-b">__m256i __lasx_xvsub_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_216">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsub_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsub.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_216">Description</h3>
+<p>Subtract 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_216">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_216">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsub_q-__m256i-a-__m256i-b">__m256i __lasx_xvsub_q (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_217">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsub_q (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsub.q xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_217">Description</h3>
+<p>Subtract 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_217">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = a.qword[i] - b.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_217">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubi_bu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_218">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubi.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_218">Description</h3>
+<p>Subtract 8-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_218">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_218">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubi_hu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_219">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubi.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_219">Description</h3>
+<p>Subtract 16-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_219">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_219">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubi_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_220">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubi.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_220">Description</h3>
+<p>Subtract 32-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_220">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_220">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubi_du-__m256i-a-imm0_31-imm">__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_221">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubi.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_221">Description</h3>
+<p>Subtract 64-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_221">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_221">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_222">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_222">Description</h3>
+<p>Subtract even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_222">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_222">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_223">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_223">Description</h3>
+<p>Subtract even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_223">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_223">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_224">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_224">Description</h3>
+<p>Subtract even-positioned signed 16-bit elements in <code>b</code> from the corresponding even-positioned signed 16-bit elements in <code>a</code>, and save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_224">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_224">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_225">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_225">Description</h3>
+<p>Subtract even-positioned unsigned 16-bit elements in <code>b</code> from the corresponding even-positioned unsigned 16-bit elements in <code>a</code>, and save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_225">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_225">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_226">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_226">Description</h3>
+<p>Subtract even-positioned signed 32-bit elements in <code>b</code> from the corresponding even-positioned signed 32-bit elements in <code>a</code>, and save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_226">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_226">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_227">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_227">Description</h3>
+<p>Subtract even-positioned unsigned 32-bit elements in <code>b</code> from the corresponding even-positioned unsigned 32-bit elements in <code>a</code>, and save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_227">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_227">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_228">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_228">Description</h3>
+<p>Subtract even-positioned signed 64-bit elements in <code>b</code> from the corresponding even-positioned signed 64-bit elements in <code>a</code>, and save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_228">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_228">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwev_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_229">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwev.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_229">Description</h3>
+<p>Subtract even-positioned unsigned 64-bit elements in <code>b</code> from the corresponding even-positioned unsigned 64-bit elements in <code>a</code>, and save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_229">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_229">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_h_b-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_230">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.h.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_230">Description</h3>
+<p>Subtract odd-positioned signed 8-bit elements in <code>b</code> from the corresponding odd-positioned signed 8-bit elements in <code>a</code>, and save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_230">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_230">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_h_bu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_231">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.h.bu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_231">Description</h3>
+<p>Subtract odd-positioned unsigned 8-bit elements in <code>b</code> from the corresponding odd-positioned unsigned 8-bit elements in <code>a</code>, and save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_231">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_231">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_w_h-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_232">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.w.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_232">Description</h3>
+<p>Subtract odd-positioned signed 16-bit elements in <code>b</code> from the corresponding odd-positioned signed 16-bit elements in <code>a</code>, and save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_232">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_232">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_w_hu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_233">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.w.hu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_233">Description</h3>
+<p>Subtract odd-positioned unsigned 16-bit elements in <code>b</code> from the corresponding odd-positioned unsigned 16-bit elements in <code>a</code>, and save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_233">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_233">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_d_w-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_234">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.d.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_234">Description</h3>
+<p>Subtract odd-positioned signed 32-bit elements in <code>b</code> from the corresponding odd-positioned signed 32-bit elements in <code>a</code>, and save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_234">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_234">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_d_wu-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_235">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.d.wu xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_235">Description</h3>
+<p>Subtract odd-positioned unsigned 32-bit elements in <code>b</code> from the corresponding odd-positioned unsigned 32-bit elements in <code>a</code>, and save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_235">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_235">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_q_d-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_236">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.q.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_236">Description</h3>
+<p>Subtract odd-positioned signed 64-bit elements in <code>b</code> from the corresponding odd-positioned signed 64-bit elements in <code>a</code>, and save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_236">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_236">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsubwod_q_du-__m256i-a-__m256i-b">__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_237">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsubwod.q.du xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_237">Description</h3>
+<p>Subtract odd-positioned unsigned 64-bit elements in <code>b</code> from the corresponding odd-positioned unsigned 64-bit elements in <code>a</code>, and save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_237">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_237">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../integer_comparison/" class="btn btn-neutral float-left" title="Integer Comparison"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../logical/" class="btn btn-neutral float-right" title="Logical">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../integer_comparison/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../logical/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/logical/index.html b/lasx/logical/index.html
new file mode 100644
index 00000000..4f8026c4
--- /dev/null
+++ b/lasx/logical/index.html
@@ -0,0 +1,689 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/logical/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Logical - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Logical";
+        var mkdocs_page_input_path = "lasx/logical.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/logical/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Logical</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvand_v-__m256i-a-__m256i-b">__m256i __lasx_xvand_v (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvandi_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvandn_v-__m256i-a-__m256i-b">__m256i __lasx_xvandn_v (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvnor_v-__m256i-a-__m256i-b">__m256i __lasx_xvnor_v (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvnori_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvor_v-__m256i-a-__m256i-b">__m256i __lasx_xvor_v (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvori_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvorn_v-__m256i-a-__m256i-b">__m256i __lasx_xvorn_v (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvxor_v-__m256i-a-__m256i-b">__m256i __lasx_xvxor_v (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvxori_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Logical</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="logical">Logical</h1>
+<h2 id="__m256i-__lasx_xvand_v-__m256i-a-__m256i-b">__m256i __lasx_xvand_v (__m256i a, __m256i b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvand_v (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvand.v xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compute bitwise AND between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &amp; b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvandi_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvandi.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compute bitwise AND between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &amp; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvandn_v-__m256i-a-__m256i-b">__m256i __lasx_xvandn_v (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvandn_v (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvandn.v xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compute bitwise ANDN between elements in <code>a</code> and <code>b</code>, i.e. <code>b</code> AND (NOT <code>a</code>).</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = b.dword[i] &amp; (~a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvnor_v-__m256i-a-__m256i-b">__m256i __lasx_xvnor_v (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvnor_v (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvnor.v xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compute bitwise NOR between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvnori_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvnori.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compute bitwise NOR between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ~(a.byte[i] | imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvor_v-__m256i-a-__m256i-b">__m256i __lasx_xvor_v (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvor_v (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvor.v xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute bitwise OR between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] | b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvori_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvori.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute bitwise OR between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] | imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvorn_v-__m256i-a-__m256i-b">__m256i __lasx_xvorn_v (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvorn_v (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvorn.v xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute bitwise ORN between elements in <code>a</code> and <code>b</code>, i.e. <code>a</code> OR (NOT <code>b</code>).</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvxor_v-__m256i-a-__m256i-b">__m256i __lasx_xvxor_v (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvxor_v (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvxor.v xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compute bitwise XOR between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvxori_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvxori.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compute bitwise XOR between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] ^ imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../integer_computation/" class="btn btn-neutral float-left" title="Integer Computation"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../memory/" class="btn btn-neutral float-right" title="Memory Load &amp; Store">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../integer_computation/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../memory/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/memory/index.html b/lasx/memory/index.html
new file mode 100644
index 00000000..f8135271
--- /dev/null
+++ b/lasx/memory/index.html
@@ -0,0 +1,475 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/memory/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Memory Load &amp; Store - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Memory Load \u0026 Store";
+        var mkdocs_page_input_path = "lasx/memory.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/memory/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Memory Load &amp; Store</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvld-void-addr-imm_n2048_2047-offset">__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvldx-void-addr-long-int-offset">__m256i __lasx_xvldx (void * addr, long int offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvldrepl_b-void-addr-imm_n2048_2047-offset">__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvldrepl_h-void-addr-imm_n1024_1023-offset">__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvldrepl_w-void-addr-imm_n512_511-offset">__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvldrepl_d-void-addr-imm_n256_255-offset">__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lasx_xvst-__m256i-data-void-addr-imm_n2048_2047-offset">void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lasx_xvstx-__m256i-data-void-addr-long-int-offset">void __lasx_xvstx (__m256i data, void * addr, long int offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lasx_xvstelm_b-__m256i-data-void-addr-imm_n128_127-offset-imm0_31-lane">void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lasx_xvstelm_h-__m256i-data-void-addr-imm_n128_127-offset-imm0_15-lane">void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lasx_xvstelm_w-__m256i-data-void-addr-imm_n128_127-offset-imm0_7-lane">void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lasx_xvstelm_d-__m256i-data-void-addr-imm_n128_127-offset-imm0_3-lane">void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Memory Load &amp; Store</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="memory-load-store">Memory Load &amp; Store</h1>
+<h2 id="__m256i-__lasx_xvld-void-addr-imm_n2048_2047-offset">__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvld xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Read whole vector from memory address <code>addr + offset</code>, save the data into <code>dst</code>. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">dst = memory_load(256, addr + offset);
+</code></pre>
+<h2 id="__m256i-__lasx_xvldx-void-addr-long-int-offset">__m256i __lasx_xvldx (void * addr, long int offset)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvldx (void * addr, long int offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldx xr, r, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Read whole vector from memory address <code>addr + offset</code>, save the data into <code>dst</code>. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">dst = memory_load(256, addr + offset);
+</code></pre>
+<h2 id="__m256i-__lasx_xvldrepl_b-void-addr-imm_n2048_2047-offset">__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldrepl.b xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Read 8-bit data from memory address <code>addr + (offset &lt;&lt; 0)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">u8 data = memory_load(8, addr + offset);
+for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = data;
+}
+</code></pre>
+<h2 id="__m256i-__lasx_xvldrepl_h-void-addr-imm_n1024_1023-offset">__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldrepl.h xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Read 16-bit data from memory address <code>addr + (offset &lt;&lt; 1)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">u16 data = memory_load(16, addr + (offset &lt;&lt; 1));
+for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = data;
+}
+</code></pre>
+<h2 id="__m256i-__lasx_xvldrepl_w-void-addr-imm_n512_511-offset">__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldrepl.w xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Read 32-bit data from memory address <code>addr + (offset &lt;&lt; 2)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">u32 data = memory_load(32, addr + (offset &lt;&lt; 2));
+for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = data;
+}
+</code></pre>
+<h2 id="__m256i-__lasx_xvldrepl_d-void-addr-imm_n256_255-offset">__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldrepl.d xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Read 64-bit data from memory address <code>addr + (offset &lt;&lt; 3)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">u64 data = memory_load(64, addr + (offset &lt;&lt; 3));
+for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = data;
+}
+</code></pre>
+<h2 id="void-__lasx_xvst-__m256i-data-void-addr-imm_n2048_2047-offset">void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvst xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Write whole vector data in <code>data</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">memory_store(256, data, addr + offset);
+</code></pre>
+<h2 id="void-__lasx_xvstx-__m256i-data-void-addr-long-int-offset">void __lasx_xvstx (__m256i data, void * addr, long int offset)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">void __lasx_xvstx (__m256i data, void * addr, long int offset)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvstx xr, r, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Write whole vector data in <code>data</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">memory_store(256, data, addr + offset);
+</code></pre>
+<h2 id="void-__lasx_xvstelm_b-__m256i-data-void-addr-imm_n128_127-offset-imm0_31-lane">void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvstelm.b xr, r, imm, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Store the 8-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">memory_store(8, data.byte[lane], addr + offset);
+</code></pre>
+<h2 id="void-__lasx_xvstelm_h-__m256i-data-void-addr-imm_n128_127-offset-imm0_15-lane">void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvstelm.h xr, r, imm, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Store the 16-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">memory_store(16, data.half[lane], addr + offset);
+</code></pre>
+<h2 id="void-__lasx_xvstelm_w-__m256i-data-void-addr-imm_n128_127-offset-imm0_7-lane">void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvstelm.w xr, r, imm, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Store the 32-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">memory_store(32, data.word[lane], addr + offset);
+</code></pre>
+<h2 id="void-__lasx_xvstelm_d-__m256i-data-void-addr-imm_n128_127-offset-imm0_3-lane">void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvstelm.d xr, r, imm, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Store the 64-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">memory_store(64, data.dword[lane], addr + offset);
+</code></pre>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../logical/" class="btn btn-neutral float-left" title="Logical"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../misc/" class="btn btn-neutral float-right" title="Misc">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../logical/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../misc/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/misc/index.html b/lasx/misc/index.html
new file mode 100644
index 00000000..14bdfcd3
--- /dev/null
+++ b/lasx/misc/index.html
@@ -0,0 +1,5745 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Misc - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Misc";
+        var mkdocs_page_input_path = "lasx/misc.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/misc/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Misc</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_h_b-__m256i-a">__m256i __lasx_xvexth_h_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_hu_bu-__m256i-a">__m256i __lasx_xvexth_hu_bu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_w_h-__m256i-a">__m256i __lasx_xvexth_w_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_wu_hu-__m256i-a">__m256i __lasx_xvexth_wu_hu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_d_w-__m256i-a">__m256i __lasx_xvexth_d_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_du_wu-__m256i-a">__m256i __lasx_xvexth_du_wu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_q_d-__m256i-a">__m256i __lasx_xvexth_q_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvexth_qu_du-__m256i-a">__m256i __lasx_xvexth_qu_du (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvextl_q_d-__m256i-a">__m256i __lasx_xvextl_q_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvextl_qu_du-__m256i-a">__m256i __lasx_xvextl_qu_du (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvextrins_b-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvextrins_h-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvextrins_w-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvextrins_d-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_h_b-__m256i-a">__m256i __lasx_vext2xv_h_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_hu_bu-__m256i-a">__m256i __lasx_vext2xv_hu_bu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_w_b-__m256i-a">__m256i __lasx_vext2xv_w_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_wu_bu-__m256i-a">__m256i __lasx_vext2xv_wu_bu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_w_h-__m256i-a">__m256i __lasx_vext2xv_w_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_wu_hu-__m256i-a">__m256i __lasx_vext2xv_wu_hu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_d_b-__m256i-a">__m256i __lasx_vext2xv_d_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_du_bu-__m256i-a">__m256i __lasx_vext2xv_du_bu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_d_h-__m256i-a">__m256i __lasx_vext2xv_d_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_du_hu-__m256i-a">__m256i __lasx_vext2xv_du_hu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_d_w-__m256i-a">__m256i __lasx_vext2xv_d_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_vext2xv_du_wu-__m256i-a">__m256i __lasx_vext2xv_du_wu (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvh_b-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvh_h-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvh_w-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvh_d-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvl_b-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvl_h-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvl_w-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvilvl_d-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvinsgr2vr_w-__m256i-a-int-b-imm0_7-imm">__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvinsgr2vr_d-__m256i-a-long-int-b-imm0_3-imm">__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvinsve0_w-__m256i-a-__m256i-b-imm0_7-imm">__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvinsve0_d-__m256i-a-__m256i-b-imm0_3-imm">__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfrstp_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfrstp_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfrstpi_b-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvfrstpi_h-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmskgez_b-__m256i-a">__m256i __lasx_xvmskgez_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmskltz_b-__m256i-a">__m256i __lasx_xvmskltz_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_1">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmskltz_h-__m256i-a">__m256i __lasx_xvmskltz_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_44">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_44">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_2">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_44">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_44">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmskltz_w-__m256i-a">__m256i __lasx_xvmskltz_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_45">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_45">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_3">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_45">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_45">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmskltz_d-__m256i-a">__m256i __lasx_xvmskltz_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_46">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_46">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_4">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_46">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_46">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvmsknz_b-__m256i-a">__m256i __lasx_xvmsknz_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_47">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_47">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_5">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_47">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_47">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackev_b-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_48">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_48">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_48">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_48">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackev_h-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_49">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_49">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_49">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_49">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackev_w-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_50">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_50">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_50">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_50">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackev_d-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_51">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_51">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_51">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_51">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackod_b-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_52">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_52">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_52">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_52">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackod_h-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_53">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_53">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_53">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_53">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackod_w-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_54">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_54">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_54">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_54">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpackod_d-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_55">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_55">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_55">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_55">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickev_b-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_56">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_56">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_56">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_56">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickev_h-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_57">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_57">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_57">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_57">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickev_w-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_58">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_58">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_58">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_58">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickev_d-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_59">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_59">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_59">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_59">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickve_w-__m256i-a-imm0_7-imm">__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_60">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_60">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_60">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_60">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickve_d-__m256i-a-imm0_3-imm">__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_61">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_61">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_61">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_61">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256-__lasx_xvpickve_w_f-__m256-a-imm0_7-imm">__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_62">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_62">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_62">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_62">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256d-__lasx_xvpickve_d_f-__m256d-a-imm0_3-imm">__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_63">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_63">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_63">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_63">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lasx_xvpickve2gr_w-__m256i-a-imm0_7-idx">int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_64">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_64">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_64">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_64">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#unsigned-int-__lasx_xvpickve2gr_wu-__m256i-a-imm0_7-idx">unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_65">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_65">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_65">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_65">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#long-int-__lasx_xvpickve2gr_d-__m256i-a-imm0_3-idx">long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_66">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_66">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_66">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_66">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#unsigned-long-int-__lasx_xvpickve2gr_du-__m256i-a-imm0_3-idx">unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_67">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_67">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_67">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_67">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickod_b-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_68">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_68">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_68">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_68">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickod_h-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_69">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_69">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_69">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_69">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickod_w-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_70">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_70">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_70">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_70">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpickod_d-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_71">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_71">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_71">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_71">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepli_b-imm_n512_511-imm">__m256i __lasx_xvrepli_b (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_72">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_72">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_72">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepli_h-imm_n512_511-imm">__m256i __lasx_xvrepli_h (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_73">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_73">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_73">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepli_w-imm_n512_511-imm">__m256i __lasx_xvrepli_w (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_74">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_74">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_74">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepli_d-imm_n512_511-imm">__m256i __lasx_xvrepli_d (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_75">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_75">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_75">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplgr2vr_b-int-val">__m256i __lasx_xvreplgr2vr_b (int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_76">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_76">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_76">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_72">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplgr2vr_h-int-val">__m256i __lasx_xvreplgr2vr_h (int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_77">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_77">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_77">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_73">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplgr2vr_w-int-val">__m256i __lasx_xvreplgr2vr_w (int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_78">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_78">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_78">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_74">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplgr2vr_d-long-int-val">__m256i __lasx_xvreplgr2vr_d (long int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_79">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_79">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_79">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_75">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve_b-__m256i-a-int-idx">__m256i __lasx_xvreplve_b (__m256i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_80">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_80">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_80">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_76">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve_h-__m256i-a-int-idx">__m256i __lasx_xvreplve_h (__m256i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_81">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_81">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_81">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_77">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve_w-__m256i-a-int-idx">__m256i __lasx_xvreplve_w (__m256i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_82">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_82">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_82">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_78">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve_d-__m256i-a-int-idx">__m256i __lasx_xvreplve_d (__m256i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_83">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_83">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_83">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_79">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve0_b-__m256i-a">__m256i __lasx_xvreplve0_b (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_84">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_84">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_84">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_80">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve0_h-__m256i-a">__m256i __lasx_xvreplve0_h (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_85">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_85">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_85">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_81">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve0_w-__m256i-a">__m256i __lasx_xvreplve0_w (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_86">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_86">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_86">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_82">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve0_d-__m256i-a">__m256i __lasx_xvreplve0_d (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_87">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_87">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_87">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_83">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvreplve0_q-__m256i-a">__m256i __lasx_xvreplve0_q (__m256i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_88">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_88">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_88">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_84">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepl128vei_b-__m256i-a-imm0_15-idx">__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_89">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_89">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_89">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_85">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepl128vei_h-__m256i-a-imm0_7-idx">__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_90">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_90">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_90">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_86">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepl128vei_w-__m256i-a-imm0_3-idx">__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_91">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_91">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_91">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_87">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrepl128vei_d-__m256i-a-imm0_1-idx">__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_92">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_92">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_92">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_88">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_93">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_93">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_93">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_89">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_bu-__m256i-a-imm0_7-imm">__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_94">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_94">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_94">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_90">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_95">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_95">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_95">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_91">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_hu-__m256i-a-imm0_15-imm">__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_96">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_96">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_96">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_92">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_97">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_97">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_97">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_93">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_98">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_98">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_98">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_94">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_99">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_99">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_99">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_95">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsat_du-__m256i-a-imm0_63-imm">__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_100">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_100">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_100">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_96">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsigncov_b-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_101">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_101">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_101">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_97">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsigncov_h-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_102">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_102">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_102">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_98">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsigncov_w-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_103">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_103">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_103">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_99">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsigncov_d-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_104">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_104">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_104">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_100">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvldi-imm_n1024_1023-imm">__m256i __lasx_xvldi (imm_n1024_1023 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_105">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_105">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_105">Operation</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Misc</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="misc">Misc</h1>
+<h2 id="__m256i-__lasx_xvexth_h_b-__m256i-a">__m256i __lasx_xvexth_h_b (__m256i a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_h_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.h.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Extend signed 8-bit elements in the higher half of each 128-bit lane of <code>a</code> to 16-bit.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+for (; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[16 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_hu_bu-__m256i-a">__m256i __lasx_xvexth_hu_bu (__m256i a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_hu_bu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.hu.bu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Extend unsigned 8-bit elements in the higher half of each 128-bit lane of <code>a</code> to 16-bit.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+for (; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[16 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_w_h-__m256i-a">__m256i __lasx_xvexth_w_h (__m256i a)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_w_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.w.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Extend signed 16-bit elements in the higher half of each 128-bit lane of <code>a</code> to 32-bit.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+for (; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[8 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_wu_hu-__m256i-a">__m256i __lasx_xvexth_wu_hu (__m256i a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_wu_hu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.wu.hu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Extend unsigned 16-bit elements in the higher half of each 128-bit lane of <code>a</code> to 32-bit.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+for (; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[8 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_d_w-__m256i-a">__m256i __lasx_xvexth_d_w (__m256i a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_d_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.d.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Extend signed 32-bit elements in the higher half of each 128-bit lane of <code>a</code> to 64-bit.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+for (; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[4 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_du_wu-__m256i-a">__m256i __lasx_xvexth_du_wu (__m256i a)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_du_wu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.du.wu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Extend unsigned 32-bit elements in the higher half of each 128-bit lane of <code>a</code> to 64-bit.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+for (; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[4 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_q_d-__m256i-a">__m256i __lasx_xvexth_q_d (__m256i a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_q_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.q.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Extend the signed 64-bit element in the higher half of each 128-bit lane of <code>a</code> to 128-bit.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+for (; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvexth_qu_du-__m256i-a">__m256i __lasx_xvexth_qu_du (__m256i a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvexth_qu_du (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvexth.qu.du xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Extend the unsigned 64-bit element in the higher half of each 128-bit lane of <code>a</code> to 128-bit.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+for (; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvextl_q_d-__m256i-a">__m256i __lasx_xvextl_q_d (__m256i a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvextl_q_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvextl.q.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Extend the signed 64-bit element in the lower half of each 128-bit lane of <code>a</code> to 128-bit.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i];
+}
+for (; i &lt; 2; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvextl_qu_du-__m256i-a">__m256i __lasx_xvextl_qu_du (__m256i a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvextl_qu_du (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvextl.qu.du xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Extend the unsigned 64-bit element in the lower half of each 128-bit lane of <code>a</code> to 128-bit.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i];
+}
+for (; i &lt; 2; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvextrins_b-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvextrins.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
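+<p>In each 128-bit lane, extract one 8-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: bits 3:0 of <code>imm</code> select the source element and bits 7:4 select the destination position.</p>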
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i == ((imm &gt;&gt; 4) &amp; 15)) ? b.byte[imm &amp; 15] : a.byte[i];
+}
+for (; i &lt; 32; i++) {
+  dst.byte[i] =
+      (i - 16 == ((imm &gt;&gt; 4) &amp; 15)) ? b.byte[(imm &amp; 15) + 16] : a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvextrins_h-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvextrins.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
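+<p>In each 128-bit lane, extract one 16-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: bits 2:0 of <code>imm</code> select the source element and bits 6:4 select the destination position.</p>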
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i == ((imm &gt;&gt; 4) &amp; 7)) ? b.half[imm &amp; 7] : a.half[i];
+}
+for (; i &lt; 16; i++) {
+  dst.half[i] = (i - 8 == ((imm &gt;&gt; 4) &amp; 7)) ? b.half[(imm &amp; 7) + 8] : a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvextrins_w-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvextrins.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
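+<p>In each 128-bit lane, extract one 32-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: bits 1:0 of <code>imm</code> select the source element and bits 5:4 select the destination position.</p>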
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i == ((imm &gt;&gt; 4) &amp; 3)) ? b.word[imm &amp; 3] : a.word[i];
+}
+for (; i &lt; 8; i++) {
+  dst.word[i] = (i - 4 == ((imm &gt;&gt; 4) &amp; 3)) ? b.word[(imm &amp; 3) + 4] : a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvextrins_d-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvextrins.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
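+<p>In each 128-bit lane, extract one 64-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: bit 0 of <code>imm</code> selects the source element and bit 4 selects the destination position.</p>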
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i == ((imm &gt;&gt; 4) &amp; 1)) ? b.dword[imm &amp; 1] : a.dword[i];
+}
+for (; i &lt; 4; i++) {
+  dst.dword[i] =
+      (i - 2 == ((imm &gt;&gt; 4) &amp; 1)) ? b.dword[(imm &amp; 1) + 2] : a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_h_b-__m256i-a">__m256i __lasx_vext2xv_h_b (__m256i a)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_h_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.h.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Extend the lowest 16 signed 8-bit elements of <code>a</code> to signed 16-bit elements.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_hu_bu-__m256i-a">__m256i __lasx_vext2xv_hu_bu (__m256i a)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_hu_bu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.hu.bu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Extend the lowest 16 unsigned 8-bit elements of <code>a</code> to unsigned 16-bit elements.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_w_b-__m256i-a">__m256i __lasx_vext2xv_w_b (__m256i a)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_w_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.w.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Extend the lowest 8 signed 8-bit elements of <code>a</code> to signed 32-bit elements.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s8)a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_wu_bu-__m256i-a">__m256i __lasx_vext2xv_wu_bu (__m256i a)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_wu_bu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.wu.bu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Extend the lowest 8 unsigned 8-bit elements of <code>a</code> to unsigned 32-bit elements.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u8)a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_w_h-__m256i-a">__m256i __lasx_vext2xv_w_h (__m256i a)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_w_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.w.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Extend the lowest 8 signed 16-bit elements of <code>a</code> to signed 32-bit elements.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_wu_hu-__m256i-a">__m256i __lasx_vext2xv_wu_hu (__m256i a)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_wu_hu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.wu.hu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Extend unsigned 16-bit lane of <code>a</code> to unsigned 32-bit elements.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_d_b-__m256i-a">__m256i __lasx_vext2xv_d_b (__m256i a)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_d_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.d.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Extend signed 8-bit lane of <code>a</code> to signed 64-bit elements.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s8)a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_du_bu-__m256i-a">__m256i __lasx_vext2xv_du_bu (__m256i a)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_du_bu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.du.bu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Extend unsigned 8-bit lane of <code>a</code> to unsigned 64-bit elements.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u8)a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_d_h-__m256i-a">__m256i __lasx_vext2xv_d_h (__m256i a)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_d_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.d.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Extend signed 16-bit lane of <code>a</code> to signed 64-bit elements.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_du_hu-__m256i-a">__m256i __lasx_vext2xv_du_hu (__m256i a)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_du_hu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.du.hu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Extend unsigned 16-bit lane of <code>a</code> to unsigned 64-bit elements.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_d_w-__m256i-a">__m256i __lasx_vext2xv_d_w (__m256i a)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_d_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.d.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Extend signed 32-bit lane of <code>a</code> to signed 64-bit elements.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_vext2xv_du_wu-__m256i-a">__m256i __lasx_vext2xv_du_wu (__m256i a)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_vext2xv_du_wu (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: vext2xv.du.wu xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Extend unsigned 32-bit lane of <code>a</code> to unsigned 64-bit elements.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvh_b-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvh_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvh.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Interleave 8-bit elements in the higher half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+for (; i &lt; 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvh_h-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvh_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvh.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Interleave 16-bit elements in the higher half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+for (; i &lt; 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvh_w-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvh_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvh.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Interleave 32-bit elements in the higher half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+for (; i &lt; 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvh_d-__m256i-a-__m256i-b">__m256i __lasx_xvilvh_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvh_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvh.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Interleave 64-bit elements in the higher half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+for (; i &lt; 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvl_b-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvl_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvl.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Interleave 8-bit elements in the lower half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+for (; i &lt; 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvl_h-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvl_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvl.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Interleave 16-bit elements in the lower half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+for (; i &lt; 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvl_w-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvl_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvl.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Interleave 32-bit elements in the lower half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+for (; i &lt; 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvilvl_d-__m256i-a-__m256i-b">__m256i __lasx_xvilvl_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvilvl_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvilvl.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Interleave 64-bit elements in the lower half of each 128-bit lane of <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">int i;
+for (i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+for (; i &lt; 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvinsgr2vr_w-__m256i-a-int-b-imm0_7-imm">__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvinsgr2vr.w xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Insert 32-bit element into lane indexed <code>imm</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (i == imm) ? b : a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvinsgr2vr_d-__m256i-a-long-int-b-imm0_3-imm">__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvinsgr2vr.d xr, r, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Insert 64-bit element into lane indexed <code>imm</code>.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvinsve0_w-__m256i-a-__m256i-b-imm0_7-imm">__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvinsve0.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Insert the first 32-bit lane of <code>b</code> into lane indexed <code>imm</code> of <code>a</code>.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (i == imm) ? b.word[0] : a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvinsve0_d-__m256i-a-__m256i-b-imm0_3-imm">__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvinsve0.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Insert the first 64-bit lane of <code>b</code> into lane indexed <code>imm</code> of <code>a</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfrstp_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrstp.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_38">Description</h3>
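+<p>For each 128-bit lane, find the first negative 8-bit element in <code>b</code> and store its index within the lane (16 if there is none) into the element of <code>a</code> in that lane selected by the first 8-bit element of <code>c</code> in that lane, modulo 16.</p>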
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i &lt; 16; i++) {
+  if ((s8)b.byte[i] &lt; 0) {
+    break;
+  }
+}
+dst.byte[c.byte[0] % 16] = i;
+for (i = 16; i &lt; 32; i++) {
+  if ((s8)b.byte[i] &lt; 0) {
+    break;
+  }
+}
+dst.byte[(c.byte[16] % 16) + 16] = i - 16;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfrstp_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrstp.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_39">Description</h3>
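+<p>For each 128-bit lane, find the first negative 16-bit element in <code>b</code> and store its index within the lane (8 if there is none) into the element of <code>a</code> in that lane selected by the first 16-bit element of <code>c</code> in that lane, modulo 8.</p>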
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i &lt; 8; i++) {
+  if ((s16)b.half[i] &lt; 0) {
+    break;
+  }
+}
+dst.half[c.half[0] % 8] = i;
+for (i = 8; i &lt; 16; i++) {
+  if ((s16)b.half[i] &lt; 0) {
+    break;
+  }
+}
+dst.half[(c.half[8] % 8) + 8] = i - 8;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfrstpi_b-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrstpi.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>For each 128-bit lane, find the first negative 8-bit element in <code>b</code> and store its index within the lane (16 if there is none) into the element of <code>a</code> in that lane selected by <code>imm</code> modulo 16.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i &lt; 16; i++) {
+  if ((s8)b.byte[i] &lt; 0) {
+    break;
+  }
+}
+dst.byte[imm % 16] = i;
+for (i = 16; i &lt; 32; i++) {
+  if ((s8)b.byte[i] &lt; 0) {
+    break;
+  }
+}
+dst.byte[(imm % 16) + 16] = i - 16;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvfrstpi_h-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvfrstpi.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>For each 128-bit lane, find the first negative 16-bit element in <code>b</code> and store its index within the lane (8 if there is none) into the element of <code>a</code> in that lane selected by <code>imm</code> modulo 8.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i &lt; 8; i++) {
+  if ((s16)b.half[i] &lt; 0) {
+    break;
+  }
+}
+dst.half[imm % 8] = i;
+for (i = 8; i &lt; 16; i++) {
+  if ((s16)b.half[i] &lt; 0) {
+    break;
+  }
+}
+dst.half[(imm % 8) + 8] = i - 8;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmskgez_b-__m256i-a">__m256i __lasx_xvmskgez_b (__m256i a)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskgez_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmskgez.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>For each 8-bit element in <code>a</code>, if the element is greater than or equal to zero, set one bit in <code>dst</code>, otherwise clear it.</p>
+<h3 id="examples">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskgez_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x00000000000001fe 0x0000000000000000 0x000000000000ff0f 0x0000000000000000
+__m256i __lasx_xvmskgez_b(__m256i{0x0000191100000000, 0x00a1000011b11c11, 0x1181000008010101, 0x0000000000000000})
+= 0x000000000000bbff 0x0000000000000000 0x000000000000ffbf 0x0000000000000000
+</code></pre>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8080808080808080;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] |= c &lt;&lt; 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = m &amp; a.dword[2];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[2] = c;
+c = m &amp; a.dword[3];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[2] |= c &lt;&lt; 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmskltz_b-__m256i-a">__m256i __lasx_xvmskltz_b (__m256i a)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmskltz.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>For each 8-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>
+<h3 id="examples_1">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000fe01 0x0000000000000000 0x00000000000000f0 0x0000000000000000
+__m256i __lasx_xvmskltz_b(__m256i{0x0000118100000000, 0x0081000081111118, 0x1181000001010801, 0x0000000000000000})
+= 0x0000000000004810 0x0000000000000000 0x0000000000000040 0x0000000000000000
+</code></pre>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8080808080808080;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] |= c &lt;&lt; 8;
+dst.dword[1] = 0;
+
+c = m &amp; a.dword[2];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[2] = c;
+c = m &amp; a.dword[3];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[2] |= c &lt;&lt; 8;
+dst.dword[3] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmskltz_h-__m256i-a">__m256i __lasx_xvmskltz_h (__m256i a)</h2>
+<h3 id="synopsis_44">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmskltz.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_44">Description</h3>
+<p>For each 16-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>
+<h3 id="examples_2">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_h(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x00000000000000f0 0x0000000000000000 0x000000000000000c 0x0000000000000000
+__m256i __lasx_xvmskltz_h(__m256i{0x0000818100000000, 0x0018000018181881, 0x1181000008080808, 0x0000000000000000})
+= 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000
+</code></pre>
+<h3 id="operation_44">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8000800080008000;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 15;
+c |= c &lt;&lt; 30;
+c &gt;&gt;= 60;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 15;
+c |= c &lt;&lt; 30;
+c &gt;&gt;= 60;
+dst.dword[0] |= c &lt;&lt; 4;
+dst.dword[1] = 0;
+
+c = m &amp; a.dword[2];
+c |= c &lt;&lt; 15;
+c |= c &lt;&lt; 30;
+c &gt;&gt;= 60;
+dst.dword[2] = c;
+c = m &amp; a.dword[3];
+c |= c &lt;&lt; 15;
+c |= c &lt;&lt; 30;
+c &gt;&gt;= 60;
+dst.dword[2] |= c &lt;&lt; 4;
+dst.dword[3] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_44">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmskltz_w-__m256i-a">__m256i __lasx_xvmskltz_w (__m256i a)</h2>
+<h3 id="synopsis_45">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmskltz.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_45">Description</h3>
+<p>For each 32-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>
+<h3 id="examples_3">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_w(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000000c 0x0000000000000000 0x0000000000000002 0x0000000000000000
+__m256i __lasx_xvmskltz_w(__m256i{0x0000811100000000, 0x0018000081111111, 0x8111000001010108, 0x0000000000000000})
+= 0x0000000000000004 0x0000000000000000 0x0000000000000002 0x0000000000000000
+</code></pre>
+<h3 id="operation_45">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8000000080000000;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 31;
+c &gt;&gt;= 62;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 31;
+c &gt;&gt;= 62;
+dst.dword[0] |= c &lt;&lt; 2;
+dst.dword[1] = 0;
+
+c = m &amp; a.dword[2];
+c |= c &lt;&lt; 31;
+c &gt;&gt;= 62;
+dst.dword[2] = c;
+c = m &amp; a.dword[3];
+c |= c &lt;&lt; 31;
+c &gt;&gt;= 62;
+dst.dword[2] |= c &lt;&lt; 2;
+dst.dword[3] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_45">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmskltz_d-__m256i-a">__m256i __lasx_xvmskltz_d (__m256i a)</h2>
+<h3 id="synopsis_46">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmskltz.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_46">Description</h3>
+<p>For each 64-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>
+<h3 id="examples_4">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmskltz_d(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x0000000000000002 0x0000000000000000 0x0000000000000001 0x0000000000000000
+__m256i __lasx_xvmskltz_d(__m256i{0x0000111800000000, 0x0081000081111111, 0x8111000008010101, 0x0000000000000000})
+= 0x0000000000000000 0x0000000000000000 0x0000000000000001 0x0000000000000000
+</code></pre>
+<h3 id="operation_46">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8000000000000000;
+u64 c = m &amp; a.dword[0];
+c &gt;&gt;= 63;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c &gt;&gt;= 63;
+dst.dword[0] |= c &lt;&lt; 1;
+dst.dword[1] = 0;
+
+c = m &amp; a.dword[2];
+c &gt;&gt;= 63;
+dst.dword[2] = c;
+c = m &amp; a.dword[3];
+c &gt;&gt;= 63;
+dst.dword[2] |= c &lt;&lt; 1;
+dst.dword[3] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_46">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvmsknz_b-__m256i-a">__m256i __lasx_xvmsknz_b (__m256i a)</h2>
+<h3 id="synopsis_47">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmsknz_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvmsknz.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_47">Description</h3>
+<p>For each 8-bit element in <code>a</code>, if the element is non-zero, set one bit in <code>dst</code>, otherwise clear it.</p>
+<h3 id="examples_5">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvmsknz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})
+= 0x000000000000feff 0x0000000000000000 0x000000000000ffff 0x0000000000000000
+__m256i __lasx_xvmsknz_b(__m256i{0x0000111100000000, 0x0011000011111111, 0x1111000001010101, 0x0000000000000000})
+= 0x0000000000004f30 0x0000000000000000 0x00000000000000cf 0x0000000000000000
+</code></pre>
+<h3 id="operation_47">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] &amp; m) + m) | a.dword[0] | m);
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] &amp; m) + m) | a.dword[1] | m);
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] |= c &lt;&lt; 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+
+c = ~(((a.dword[2] &amp; m) + m) | a.dword[2] | m);
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[2] = c;
+c = ~(((a.dword[3] &amp; m) + m) | a.dword[3] | m);
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[2] |= c &lt;&lt; 8;
+dst.dword[2] = (u16)~dst.dword[2];
+dst.dword[3] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_47">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackev_b-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_48">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackev_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackev.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_48">Description</h3>
+<p>Collect and pack even-positioned 8-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_48">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_48">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackev_h-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_49">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackev_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackev.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_49">Description</h3>
+<p>Collect and pack even-positioned 16-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_49">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_49">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackev_w-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_50">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackev_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackev.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_50">Description</h3>
+<p>Collect and pack even-positioned 32-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_50">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_50">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackev_d-__m256i-a-__m256i-b">__m256i __lasx_xvpackev_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_51">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackev_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackev.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_51">Description</h3>
+<p>Collect and pack even-positioned 64-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_51">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_51">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackod_b-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_52">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackod_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackod.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_52">Description</h3>
+<p>Collect and pack odd-positioned 8-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_52">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_52">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackod_h-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_53">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackod_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackod.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_53">Description</h3>
+<p>Collect and pack odd-positioned 16-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_53">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_53">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackod_w-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_54">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackod_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackod.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_54">Description</h3>
+<p>Collect and pack odd-positioned 32-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_54">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_54">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpackod_d-__m256i-a-__m256i-b">__m256i __lasx_xvpackod_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_55">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpackod_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpackod.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_55">Description</h3>
+<p>Collect and pack odd-positioned 64-bit elements in <code>a</code> and <code>b</code>, and store the result into <code>dst</code>.</p>
+<h3 id="operation_55">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_55">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickev_b-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_56">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickev_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickev.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_56">Description</h3>
+<p>Pick even-positioned 8-bit elements in <code>b</code> first, then pick even-positioned 8-bit elements in <code>a</code>.</p>
+<h3 id="operation_56">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = (i &lt; 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_56">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickev_h-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_57">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickev_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickev.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_57">Description</h3>
+<p>Pick even-positioned 16-bit elements in <code>b</code> first, then pick even-positioned 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_57">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = (i &lt; 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_57">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickev_w-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_58">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickev_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickev.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_58">Description</h3>
+<p>Pick even-positioned 32-bit elements in <code>b</code> first, then pick even-positioned 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_58">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = (i &lt; 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_58">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickev_d-__m256i-a-__m256i-b">__m256i __lasx_xvpickev_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_59">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickev_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickev.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_59">Description</h3>
+<p>Pick even-positioned 64-bit elements in <code>b</code> first, then pick even-positioned 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_59">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_59">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickve_w-__m256i-a-imm0_7-imm">__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_60">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_60">Description</h3>
+<p>Copy one 32-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>
+<h3 id="operation_60">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_60">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickve_d-__m256i-a-imm0_3-imm">__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)</h2>
+<h3 id="synopsis_61">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_61">Description</h3>
+<p>Copy one 64-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>
+<h3 id="operation_61">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_61">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256-__lasx_xvpickve_w_f-__m256-a-imm0_7-imm">__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)</h2>
+<h3 id="synopsis_62">Synopsis</h3>
+<pre><code class="language-c++">__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_62">Description</h3>
+<p>Copy one 32-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>
+<h3 id="operation_62">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (i == 0) ? a.word[imm] : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_62">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256d-__lasx_xvpickve_d_f-__m256d-a-imm0_3-imm">__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)</h2>
+<h3 id="synopsis_63">Synopsis</h3>
+<pre><code class="language-c++">__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_63">Description</h3>
+<p>Copy one 64-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>
+<h3 id="operation_63">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_63">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lasx_xvpickve2gr_w-__m256i-a-imm0_7-idx">int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)</h2>
+<h3 id="synopsis_64">Synopsis</h3>
+<pre><code class="language-c++">int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve2gr.w r, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_64">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_64">Operation</h3>
+<pre><code class="language-c++">dst = (s32)a.word[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_64">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="unsigned-int-__lasx_xvpickve2gr_wu-__m256i-a-imm0_7-idx">unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)</h2>
+<h3 id="synopsis_65">Synopsis</h3>
+<pre><code class="language-c++">unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve2gr.wu r, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_65">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_65">Operation</h3>
+<pre><code class="language-c++">dst = (u32)a.word[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_65">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="long-int-__lasx_xvpickve2gr_d-__m256i-a-imm0_3-idx">long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)</h2>
+<h3 id="synopsis_66">Synopsis</h3>
+<pre><code class="language-c++">long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve2gr.d r, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_66">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_66">Operation</h3>
+<pre><code class="language-c++">dst = (s64)a.dword[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_66">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="unsigned-long-int-__lasx_xvpickve2gr_du-__m256i-a-imm0_3-idx">unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)</h2>
+<h3 id="synopsis_67">Synopsis</h3>
+<pre><code class="language-c++">unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickve2gr.du r, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_67">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_67">Operation</h3>
+<pre><code class="language-c++">dst = (u64)a.dword[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_67">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickod_b-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_68">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickod_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickod.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_68">Description</h3>
+<p>Pick odd-positioned 8-bit elements in <code>b</code> first, then pick odd-positioned 8-bit elements in <code>a</code>.</p>
+<h3 id="operation_68">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = (i &lt; 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_68">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickod_h-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_69">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickod_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickod.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_69">Description</h3>
+<p>Pick odd-positioned 16-bit elements in <code>b</code> first, then pick odd-positioned 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_69">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = (i &lt; 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_69">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickod_w-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_70">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickod_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickod.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_70">Description</h3>
+<p>Pick odd-positioned 32-bit elements in <code>b</code> first, then pick odd-positioned 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_70">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = (i &lt; 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_70">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpickod_d-__m256i-a-__m256i-b">__m256i __lasx_xvpickod_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_71">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpickod_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpickod.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_71">Description</h3>
+<p>Pick odd-positioned 64-bit elements in <code>b</code> first, then pick odd-positioned 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_71">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_71">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrepli_b-imm_n512_511-imm">__m256i __lasx_xvrepli_b (imm_n512_511 imm)</h2>
+<h3 id="synopsis_72">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepli_b (imm_n512_511 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_72">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_72">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m256i-__lasx_xvrepli_h-imm_n512_511-imm">__m256i __lasx_xvrepli_h (imm_n512_511 imm)</h2>
+<h3 id="synopsis_73">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepli_h (imm_n512_511 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_73">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_73">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m256i-__lasx_xvrepli_w-imm_n512_511-imm">__m256i __lasx_xvrepli_w (imm_n512_511 imm)</h2>
+<h3 id="synopsis_74">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepli_w (imm_n512_511 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_74">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_74">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m256i-__lasx_xvrepli_d-imm_n512_511-imm">__m256i __lasx_xvrepli_d (imm_n512_511 imm)</h2>
+<h3 id="synopsis_75">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepli_d (imm_n512_511 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_75">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_75">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m256i-__lasx_xvreplgr2vr_b-int-val">__m256i __lasx_xvreplgr2vr_b (int val)</h2>
+<h3 id="synopsis_76">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplgr2vr_b (int val)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplgr2vr.b xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_76">Description</h3>
+<p>Repeat <code>val</code> to fill the whole vector.</p>
+<h3 id="operation_76">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_72">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplgr2vr_h-int-val">__m256i __lasx_xvreplgr2vr_h (int val)</h2>
+<h3 id="synopsis_77">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplgr2vr_h (int val)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplgr2vr.h xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_77">Description</h3>
+<p>Repeat <code>val</code> to fill the whole vector.</p>
+<h3 id="operation_77">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_73">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplgr2vr_w-int-val">__m256i __lasx_xvreplgr2vr_w (int val)</h2>
+<h3 id="synopsis_78">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplgr2vr_w (int val)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplgr2vr.w xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_78">Description</h3>
+<p>Repeat <code>val</code> to fill the whole vector.</p>
+<h3 id="operation_78">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_74">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplgr2vr_d-long-int-val">__m256i __lasx_xvreplgr2vr_d (long int val)</h2>
+<h3 id="synopsis_79">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplgr2vr_d (long int val)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplgr2vr.d xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_79">Description</h3>
+<p>Repeat <code>val</code> to fill the whole vector.</p>
+<h3 id="operation_79">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_75">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve_b-__m256i-a-int-idx">__m256i __lasx_xvreplve_b (__m256i a, int idx)</h2>
+<h3 id="synopsis_80">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve_b (__m256i a, int idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve.b xr, xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_80">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 8-bit element in lane <code>idx % 16</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_80">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[idx % 16];
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[(idx % 16) + 16];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_76">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve_h-__m256i-a-int-idx">__m256i __lasx_xvreplve_h (__m256i a, int idx)</h2>
+<h3 id="synopsis_81">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve_h (__m256i a, int idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve.h xr, xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_81">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 16-bit element in lane <code>idx % 8</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_81">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[idx % 8];
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = a.half[(idx % 8) + 8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_77">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve_w-__m256i-a-int-idx">__m256i __lasx_xvreplve_w (__m256i a, int idx)</h2>
+<h3 id="synopsis_82">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve_w (__m256i a, int idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve.w xr, xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_82">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 32-bit element in lane <code>idx % 4</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_82">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[idx % 4];
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = a.word[(idx % 4) + 4];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_78">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve_d-__m256i-a-int-idx">__m256i __lasx_xvreplve_d (__m256i a, int idx)</h2>
+<h3 id="synopsis_83">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve_d (__m256i a, int idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve.d xr, xr, r
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_83">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 64-bit element in lane <code>idx % 2</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_83">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[idx % 2];
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[(idx % 2) + 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_79">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve0_b-__m256i-a">__m256i __lasx_xvreplve0_b (__m256i a)</h2>
+<h3 id="synopsis_84">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve0_b (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve0.b xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_84">Description</h3>
+<p>Repeat the first 8-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>
+<h3 id="operation_84">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[0];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_80">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve0_h-__m256i-a">__m256i __lasx_xvreplve0_h (__m256i a)</h2>
+<h3 id="synopsis_85">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve0_h (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve0.h xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_85">Description</h3>
+<p>Repeat the first 16-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>
+<h3 id="operation_85">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[0];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_81">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve0_w-__m256i-a">__m256i __lasx_xvreplve0_w (__m256i a)</h2>
+<h3 id="synopsis_86">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve0_w (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve0.w xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_86">Description</h3>
+<p>Repeat the first 32-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>
+<h3 id="operation_86">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[0];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_82">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve0_d-__m256i-a">__m256i __lasx_xvreplve0_d (__m256i a)</h2>
+<h3 id="synopsis_87">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve0_d (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve0.d xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_87">Description</h3>
+<p>Repeat the first 64-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>
+<h3 id="operation_87">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[0];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_83">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvreplve0_q-__m256i-a">__m256i __lasx_xvreplve0_q (__m256i a)</h2>
+<h3 id="synopsis_88">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvreplve0_q (__m256i a)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvreplve0.q xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_88">Description</h3>
+<p>Repeat the first 128-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>
+<h3 id="operation_88">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.qword[i] = a.qword[0];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_84">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrepl128vei_b-__m256i-a-imm0_15-idx">__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)</h2>
+<h3 id="synopsis_89">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrepl128vei.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_89">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 8-bit element in lane <code>idx</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_89">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[idx];
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[idx + 16];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_85">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrepl128vei_h-__m256i-a-imm0_7-idx">__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)</h2>
+<h3 id="synopsis_90">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrepl128vei.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_90">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 16-bit element in lane <code>idx</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_90">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[idx];
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = a.half[idx + 8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_86">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrepl128vei_w-__m256i-a-imm0_3-idx">__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)</h2>
+<h3 id="synopsis_91">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrepl128vei.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_91">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 32-bit element in lane <code>idx</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_91">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[idx];
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = a.word[idx + 4];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_87">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrepl128vei_d-__m256i-a-imm0_1-idx">__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)</h2>
+<h3 id="synopsis_92">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrepl128vei.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_92">Description</h3>
+<p>Within each 128-bit half of <code>a</code>, repeat the 64-bit element in lane <code>idx</code> of that half to fill the corresponding 128-bit half of <code>dst</code>.</p>
+<h3 id="operation_92">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[idx];
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[idx + 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_88">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_93">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_93">Description</h3>
+<p>Clamp signed 8-bit elements in <code>a</code> to the range [-2<sup>imm</sup>, 2<sup>imm</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_93">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = clamp&lt;s8&gt;(a.byte[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_89">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_bu-__m256i-a-imm0_7-imm">__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_94">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_94">Description</h3>
+<p>Clamp unsigned 8-bit elements in <code>a</code> to the range [0, 2<sup>imm + 1</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_94">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = clamp&lt;u8&gt;(a.byte[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_90">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_95">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_95">Description</h3>
+<p>Clamp signed 16-bit elements in <code>a</code> to the range [-2<sup>imm</sup>, 2<sup>imm</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_95">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = clamp&lt;s16&gt;(a.half[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_91">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_hu-__m256i-a-imm0_15-imm">__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_96">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_96">Description</h3>
+<p>Clamp unsigned 16-bit elements in <code>a</code> to the range [0, 2<sup>imm + 1</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_96">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = clamp&lt;u16&gt;(a.half[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_92">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_97">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_97">Description</h3>
+<p>Clamp signed 32-bit elements in <code>a</code> to the range [-2<sup>imm</sup>, 2<sup>imm</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_97">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = clamp&lt;s32&gt;(a.word[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_93">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_98">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_98">Description</h3>
+<p>Clamp unsigned 32-bit elements in <code>a</code> to the range [0, 2<sup>imm + 1</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_98">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = clamp&lt;u32&gt;(a.word[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_94">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_99">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_99">Description</h3>
+<p>Clamp signed 64-bit elements in <code>a</code> to the range [-2<sup>imm</sup>, 2<sup>imm</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_99">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = clamp&lt;s64&gt;(a.dword[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_95">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsat_du-__m256i-a-imm0_63-imm">__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_100">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsat.du xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_100">Description</h3>
+<p>Clamp unsigned 64-bit elements in <code>a</code> to the range [0, 2<sup>imm + 1</sup> - 1] specified by <code>imm</code>.</p>
+<h3 id="operation_100">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = clamp&lt;u64&gt;(a.dword[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_96">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsigncov_b-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_101">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsigncov.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_101">Description</h3>
+<p>If the 8-bit element in <code>a</code> is zero, set the result to zero. If the signed 8-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, this is equivalent to computing the absolute value.</p>
+<h3 id="operation_101">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] =
+      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] &gt; 0 ? b.byte[i] : -b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_97">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsigncov_h-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_102">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsigncov.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_102">Description</h3>
+<p>If the 16-bit element in <code>a</code> is zero, set the result to zero. If the signed 16-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, this is equivalent to computing the absolute value.</p>
+<h3 id="operation_102">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] =
+      (a.half[i] == 0) ? 0 : ((s16)a.half[i] &gt; 0 ? b.half[i] : -b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_98">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsigncov_w-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_103">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsigncov.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_103">Description</h3>
+<p>If the 32-bit element in <code>a</code> is zero, set the result to zero. If the signed 32-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, this is equivalent to computing the absolute value.</p>
+<h3 id="operation_103">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] =
+      (a.word[i] == 0) ? 0 : ((s32)a.word[i] &gt; 0 ? b.word[i] : -b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_99">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsigncov_d-__m256i-a-__m256i-b">__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_104">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsigncov.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_104">Description</h3>
+<p>If the 64-bit element in <code>a</code> is zero, set the result to zero. If the signed 64-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, this is equivalent to computing the absolute value.</p>
+<h3 id="operation_104">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] =
+      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] &gt; 0 ? b.dword[i] : -b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_100">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvldi-imm_n1024_1023-imm">__m256i __lasx_xvldi (imm_n1024_1023 imm)</h2>
+<h3 id="synopsis_105">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvldi (imm_n1024_1023 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvldi xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_105">Description</h3>
+<p>Initialize <code>dst</code> using predefined patterns:</p>
+<ul>
+<li><code>imm[12:10]=0b000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>
+<li><code>imm[12:10]=0b001</code>: broadcast sign-extended <code>imm[9:0]</code> as 16-bit elements to all lanes</li>
+<li><code>imm[12:10]=0b010</code>: broadcast sign-extended <code>imm[9:0]</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:10]=0b011</code>: broadcast sign-extended <code>imm[9:0]</code> as 64-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10000</code>: broadcast <code>imm[7:0]</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10001</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10010</code>: broadcast <code>imm[7:0] &lt;&lt; 16</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10011</code>: broadcast <code>imm[7:0] &lt;&lt; 24</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10100</code>: broadcast <code>imm[7:0]</code> as 16-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10101</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 16-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10110</code>: broadcast <code>(imm[7:0] &lt;&lt; 8) | 0xFF</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10111</code>: broadcast <code>(imm[7:0] &lt;&lt; 16) | 0xFFFF</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11001</code>: repeat each bit of <code>imm[7:0]</code> eight times, and broadcast the result as 64-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11010</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11011</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 64-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11100</code>: broadcast <code>(imm[7] &lt;&lt; 63) | ((1-imm[6]) &lt;&lt; 62) | ((imm[6] * 0xFF) &lt;&lt; 54) | (imm[5:0] &lt;&lt; 48)</code> as 64-bit elements to all lanes</li>
+</ul>
+<h3 id="operation_105">Operation</h3>
+<pre><code class="language-c++">u64 imm12_10 = (imm &gt;&gt; 10) &amp; 0b111;
+u64 imm12_8 = (imm &gt;&gt; 8) &amp; 0b11111;
+u64 imm9_0 = imm &amp; 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 &lt;&lt; 54) &gt;&gt; 54;
+u64 imm7_0 = imm &amp; 0xFF;
+u64 imm7 = (imm &gt;&gt; 7) &amp; 0x1;
+u64 imm6 = (imm &gt;&gt; 6) &amp; 0x1;
+u64 imm5 = (imm &gt;&gt; 5) &amp; 0x1;
+u64 imm5_0 = imm &amp; 0x3F;
+u64 imm4 = (imm &gt;&gt; 4) &amp; 0x1;
+u64 imm3 = (imm &gt;&gt; 3) &amp; 0x1;
+u64 imm2 = (imm &gt;&gt; 2) &amp; 0x1;
+u64 imm1 = (imm &gt;&gt; 1) &amp; 0x1;
+u64 imm0 = imm &amp; 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 &lt;&lt; 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 &lt;&lt; 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 &lt;&lt; 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 &lt;&lt; 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 &lt;&lt; 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 &lt;&lt; 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |
+                    (imm5_0 &lt;&lt; 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |
+                    (imm5_0 &lt;&lt; 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 &lt;&lt; 63) | ((1 - imm6) &lt;&lt; 62) | ((imm6 * 0xFF) &lt;&lt; 54) |
+                    (imm5_0 &lt;&lt; 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i &lt; 32; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i &lt; 16; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i &lt; 8; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i &lt; 4; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
+</code></pre>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../memory/" class="btn btn-neutral float-left" title="Memory Load & Store"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../permutation/" class="btn btn-neutral float-right" title="Permutation">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../memory/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../permutation/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/permutation/index.html b/lasx/permutation/index.html
new file mode 100644
index 00000000..26b7fbfe
--- /dev/null
+++ b/lasx/permutation/index.html
@@ -0,0 +1,411 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/permutation/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Permutation - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Permutation";
+        var mkdocs_page_input_path = "lasx/permutation.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/permutation/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Permutation</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpermi_w-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpermi_d-__m256i-a-imm0_255-imm">__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvpermi_q-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvperm_w-__m256i-a-__m256i-b">__m256i __lasx_xvperm_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Permutation</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="permutation">Permutation</h1>
+<h2 id="__m256i-__lasx_xvpermi_w-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpermi.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Permute words from <code>a</code> and <code>b</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">dst.word[0] = b.word[imm &amp; 0x3];
+dst.word[1] = b.word[(imm &gt;&gt; 2) &amp; 0x3];
+dst.word[2] = a.word[(imm &gt;&gt; 4) &amp; 0x3];
+dst.word[3] = a.word[(imm &gt;&gt; 6) &amp; 0x3];
+dst.word[4] = b.word[4 + (imm &amp; 0x3)];
+dst.word[5] = b.word[4 + ((imm &gt;&gt; 2) &amp; 0x3)];
+dst.word[6] = a.word[4 + ((imm &gt;&gt; 4) &amp; 0x3)];
+dst.word[7] = a.word[4 + ((imm &gt;&gt; 6) &amp; 0x3)];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpermi_d-__m256i-a-imm0_255-imm">__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpermi.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Permute double words from <code>a</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">dst.dword[0] = a.dword[imm &amp; 0x3];
+dst.dword[1] = a.dword[(imm &gt;&gt; 2) &amp; 0x3];
+dst.dword[2] = a.dword[(imm &gt;&gt; 4) &amp; 0x3];
+dst.dword[3] = a.dword[(imm &gt;&gt; 6) &amp; 0x3];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvpermi_q-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvpermi.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Permute quad words from <code>a</code> and <code>b</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">if ((imm &amp; 0x4) &amp;&amp; MACHINE_3C5000) {
+  // Caveat: observed in 3C5000
+  dst.qword[0] = 0;
+} else {
+  dst.qword[0] = (imm &amp; 2) ? a.qword[imm &amp; 0x1] : b.qword[imm &amp; 0x1];
+}
+if ((imm &amp; 0x80) &amp;&amp; MACHINE_3C5000) {
+  // Caveat: observed in 3C5000
+  dst.qword[1] = 0;
+} else {
+  dst.qword[1] =
+      (imm &amp; 0x20) ? a.qword[(imm &gt;&gt; 4) &amp; 0x1] : b.qword[(imm &gt;&gt; 4) &amp; 0x1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvperm_w-__m256i-a-__m256i-b">__m256i __lasx_xvperm_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvperm_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvperm.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Permute words from <code>a</code> with indices recorded in <code>b</code> and store into <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[b.word[i] % 0x8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../misc/" class="btn btn-neutral float-left" title="Misc"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../shift/" class="btn btn-neutral float-right" title="Shift">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../misc/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../shift/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/shift/index.html b/lasx/shift/index.html
new file mode 100644
index 00000000..b0788626
--- /dev/null
+++ b/lasx/shift/index.html
@@ -0,0 +1,8908 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shift/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Shift - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Shift";
+        var mkdocs_page_input_path = "lasx/shift.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/shift/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Shift</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbsll_v-__m256i-a-imm0_31-imm">__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvbsrl_v-__m256i-a-imm0_31-imm">__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsll_b-__m256i-a-__m256i-b">__m256i __lasx_xvsll_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsll_h-__m256i-a-__m256i-b">__m256i __lasx_xvsll_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsll_w-__m256i-a-__m256i-b">__m256i __lasx_xvsll_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsll_d-__m256i-a-__m256i-b">__m256i __lasx_xvsll_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslli_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslli_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslli_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvslli_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsllwil_h_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsllwil_hu_bu-__m256i-a-imm0_7-imm">__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsllwil_w_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsllwil_wu_hu-__m256i-a-imm0_15-imm">__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsllwil_d_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsllwil_du_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsra_b-__m256i-a-__m256i-b">__m256i __lasx_xvsra_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsra_h-__m256i-a-__m256i-b">__m256i __lasx_xvsra_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsra_w-__m256i-a-__m256i-b">__m256i __lasx_xvsra_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsra_d-__m256i-a-__m256i-b">__m256i __lasx_xvsra_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrai_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrai_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrai_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrai_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsran_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsran_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsran_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrani_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrani_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrani_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrani_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrar_b-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrar_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrar_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrar_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrari_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrari_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrari_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrari_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_44">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_44">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_44">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_44">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrarni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_45">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_45">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_45">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_45">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrl_b-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_46">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_46">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_46">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_46">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrl_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_47">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_47">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_47">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_47">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrl_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_48">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_48">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_48">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_48">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrl_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_49">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_49">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_49">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_49">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrli_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_50">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_50">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_50">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_50">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrli_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_51">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_51">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_51">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_51">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrli_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_52">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_52">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_52">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_52">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrli_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_53">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_53">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_53">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_53">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrln_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_54">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_54">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_54">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_54">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrln_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_55">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_55">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_55">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_55">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrln_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_56">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_56">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_56">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_56">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_57">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_57">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_57">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_57">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_58">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_58">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_58">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_58">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_59">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_59">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_59">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_59">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_60">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_60">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_60">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_60">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlr_b-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_61">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_61">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_61">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_61">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlr_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_62">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_62">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_62">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_62">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlr_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_63">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_63">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_63">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_63">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlr_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_64">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_64">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_64">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_64">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlri_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_65">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_65">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_65">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_65">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlri_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_66">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_66">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_66">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_66">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlri_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_67">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_67">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_67">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_67">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlri_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_68">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_68">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_68">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_68">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_69">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_69">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_69">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_69">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_70">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_70">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_70">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_70">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_71">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_71">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_71">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_71">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_72">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_72">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_72">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_72">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_73">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_73">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_73">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_73">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_74">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_74">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_74">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_74">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvsrlrni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_75">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_75">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_75">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_75">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssran_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_76">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_76">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_76">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_76">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssran_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_77">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_77">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_77">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_77">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssran_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_78">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_78">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_78">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_78">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssran_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_79">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_79">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_79">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_79">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssran_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_80">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_80">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_80">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_80">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssran_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_81">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_81">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_81">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_81">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_82">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_82">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_82">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_82">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_83">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_83">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_83">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_83">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_84">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_84">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_84">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_84">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_85">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_85">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_85">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_85">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_86">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_86">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_86">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_86">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_87">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_87">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_87">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_87">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_88">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_88">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_88">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_88">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrani_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_89">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_89">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_89">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_89">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_90">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_90">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_90">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_90">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarn_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_91">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_91">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_91">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_91">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_92">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_92">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_92">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_92">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarn_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_93">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_93">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_93">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_93">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_94">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_94">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_94">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_94">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarn_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_95">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_95">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_95">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_95">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_96">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_96">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_96">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_96">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_97">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_97">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_97">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_97">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_98">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_98">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_98">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_98">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_99">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_99">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_99">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_99">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_100">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_100">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_100">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_100">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_101">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_101">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_101">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_101">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_102">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_102">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_102">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_102">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrarni_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_103">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_103">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_103">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_103">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrln_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_104">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_104">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_104">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_104">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrln_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_105">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_105">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_105">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_105">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrln_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_106">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_106">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_106">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_106">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrln_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_107">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_107">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_107">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_107">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrln_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_108">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_108">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_108">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_108">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrln_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_109">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_109">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_109">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_109">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_110">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_110">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_110">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_110">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_111">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_111">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_111">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_111">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_112">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_112">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_112">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_112">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_113">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_113">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_113">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_113">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_114">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_114">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_114">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_114">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_115">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_115">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_115">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_115">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_116">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_116">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_116">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_116">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlni_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_117">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_117">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_117">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_117">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_118">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_118">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_118">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_118">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrn_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_119">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_119">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_119">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_119">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_120">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_120">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_120">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_120">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrn_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_121">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_121">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_121">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_121">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_122">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_122">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_122">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_122">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrn_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_123">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_123">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_123">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_123">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_124">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_124">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_124">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_124">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_125">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_125">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_125">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_125">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_126">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_126">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_126">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_126">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_127">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_127">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_127">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_127">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_128">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_128">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_128">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_128">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_129">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_129">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_129">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_129">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_130">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_130">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_130">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_130">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvssrlrni_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_131">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_131">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_131">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_131">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotr_b-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_b (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_132">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_132">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_132">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_132">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotr_h-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_h (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_133">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_133">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_133">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_133">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotr_w-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_w (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_134">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_134">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_134">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_134">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotr_d-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_d (__m256i a, __m256i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_135">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_135">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_135">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_135">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotri_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_136">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_136">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_136">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_136">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotri_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_137">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_137">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_137">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_137">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotri_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_138">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_138">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_138">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_138">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvrotri_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_139">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_139">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_139">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_139">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Shift</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="shift">Shift</h1>
+<h2 id="__m256i-__lasx_xvbsll_v-__m256i-a-imm0_31-imm">__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbsll.v xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Shift each 128-bit lane of <code>a</code> left by <code>imm * 8</code> bits (the bit count is taken modulo 128), store the result to <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] &lt;&lt; shift;
+dst.qword[1] = (u128)a.qword[1] &lt;&lt; shift;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvbsrl_v-__m256i-a-imm0_31-imm">__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvbsrl.v xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Shift each 128-bit lane of <code>a</code> right by <code>imm * 8</code> bits (the bit count is taken modulo 128), store the result to <code>dst</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] &gt;&gt; shift;
+dst.qword[1] = (u128)a.qword[1] &gt;&gt; shift;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsll_b-__m256i-a-__m256i-b">__m256i __lasx_xvsll_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsll_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsll.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &lt;&lt; (b.byte[i] &amp; 0x7);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsll_h-__m256i-a-__m256i-b">__m256i __lasx_xvsll_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsll_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsll.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] &lt;&lt; (b.half[i] &amp; 0xf);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsll_w-__m256i-a-__m256i-b">__m256i __lasx_xvsll_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsll_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsll.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] &lt;&lt; (b.word[i] &amp; 0x1f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsll_d-__m256i-a-__m256i-b">__m256i __lasx_xvsll_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsll_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsll.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &lt;&lt; (b.dword[i] &amp; 0x3f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslli_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslli.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslli_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslli.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslli_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslli.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvslli_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvslli.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsllwil_h_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsllwil.h.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_10">Description</h3>
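+<p>Sign-extend the signed 8-bit elements in the lower 64 bits of each 128-bit lane of <code>a</code> to 16 bits, left shift them by <code>imm</code>, and store the signed 16-bit result to <code>dst</code>.</p>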
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i] &lt;&lt; imm;
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i + 8] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsllwil_hu_bu-__m256i-a-imm0_7-imm">__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsllwil.hu.bu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_11">Description</h3>
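+<p>Zero-extend the unsigned 8-bit elements in the lower 64 bits of each 128-bit lane of <code>a</code> to 16 bits, left shift them by <code>imm</code>, and store the unsigned 16-bit result to <code>dst</code>.</p>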
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i] &lt;&lt; imm;
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i + 8] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsllwil_w_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsllwil.w.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_12">Description</h3>
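+<p>Sign-extend the signed 16-bit elements in the lower 64 bits of each 128-bit lane of <code>a</code> to 32 bits, left shift them by <code>imm</code>, and store the signed 32-bit result to <code>dst</code>.</p>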
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[i] &lt;&lt; imm;
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = (s32)(s16)a.half[i + 4] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsllwil_wu_hu-__m256i-a-imm0_15-imm">__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsllwil.wu.hu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_13">Description</h3>
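+<p>Zero-extend the unsigned 16-bit elements in the lower 64 bits of each 128-bit lane of <code>a</code> to 32 bits, left shift them by <code>imm</code>, and store the unsigned 32-bit result to <code>dst</code>.</p>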
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[i] &lt;&lt; imm;
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = (u32)(u16)a.half[i + 4] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsllwil_d_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsllwil.d.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_14">Description</h3>
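+<p>Sign-extend the signed 32-bit elements in the lower 64 bits of each 128-bit lane of <code>a</code> to 64 bits, left shift them by <code>imm</code>, and store the signed 64-bit result to <code>dst</code>.</p>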
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i] &lt;&lt; imm;
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i + 2] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsllwil_du_wu-__m256i-a-imm0_31-imm">__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsllwil.du.wu xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_15">Description</h3>
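+<p>Zero-extend the unsigned 32-bit elements in the lower 64 bits of each 128-bit lane of <code>a</code> to 64 bits, left shift them by <code>imm</code>, and store the unsigned 64-bit result to <code>dst</code>.</p>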
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i] &lt;&lt; imm;
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i + 2] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsra_b-__m256i-a-__m256i-b">__m256i __lasx_xvsra_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsra_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsra.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; (b.byte[i] &amp; 0x7);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsra_h-__m256i-a-__m256i-b">__m256i __lasx_xvsra_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsra_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsra.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i]) &gt;&gt; (b.half[i] &amp; 0xf);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsra_w-__m256i-a-__m256i-b">__m256i __lasx_xvsra_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsra_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsra.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i]) &gt;&gt; (b.word[i] &amp; 0x1f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsra_d-__m256i-a-__m256i-b">__m256i __lasx_xvsra_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsra_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsra.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; (b.dword[i] &amp; 0x3f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrai_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrai.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrai_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrai.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = ((s16)a.half[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrai_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrai.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = ((s32)a.word[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrai_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrai.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsran_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsran.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_24">Description</h3>
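+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each 128-bit lane of <code>dst</code> are set to zero.</p>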
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? (s8)((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = (i &lt; 24) ? (s8)((s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsran_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsran.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_25">Description</h3>
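+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each 128-bit lane of <code>dst</code> are set to zero.</p>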
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? (s16)((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] =
+      (i &lt; 12) ? (s16)((s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsran_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsran.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_26">Description</h3>
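+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each 128-bit lane of <code>dst</code> are set to zero.</p>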
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (s32)((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] =
+      (i &lt; 6) ? (s32)((s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrani_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrani.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_27">Description</h3>
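+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>. Within each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>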
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] =
+      (i &lt; 8) ? (s8)((s16)b.half[i] &gt;&gt; imm) : (s8)((s16)a.half[i - 8] &gt;&gt; imm);
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = (i &lt; 24) ? (s8)((s16)b.half[i - 8] &gt;&gt; imm)
+                         : (s8)((s16)a.half[i - 16] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrani_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrani.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_28">Description</h3>
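+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>. Within each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>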
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (i &lt; 4) ? (s16)((s32)b.word[i] &gt;&gt; imm) : (s16)((s32)a.word[i - 4] &gt;&gt; imm);
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = (i &lt; 12) ? (s16)((s32)b.word[i - 4] &gt;&gt; imm)
+                         : (s16)((s32)a.word[i - 8] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrani_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrani.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_29">Description</h3>
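+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>. Within each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>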
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (s32)((s64)b.dword[i] &gt;&gt; imm)
+                        : (s32)((s64)a.dword[i - 2] &gt;&gt; imm);
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = (i &lt; 6) ? (s32)((s64)b.dword[i - 2] &gt;&gt; imm)
+                        : (s32)((s64)a.dword[i - 4] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrani_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrani.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_30">Description</h3>
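+<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>. Within each 128-bit lane of <code>dst</code>, the lower 64 bits hold the result from <code>b</code> and the upper 64 bits hold the result from <code>a</code>.</p>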
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? (s64)((s128)b.qword[i] &gt;&gt; imm)
+                         : (s64)((s128)a.qword[i - 1] &gt;&gt; imm);
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 3) ? (s64)((s128)b.qword[i - 1] &gt;&gt; imm)
+                         : (s64)((s128)a.qword[i - 2] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrar_b-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrar_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrar.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  if ((b.byte[i] &amp; 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +
+                  (((s8)a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrar_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrar_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrar.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if ((b.half[i] &amp; 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +
+                  (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrar_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrar_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrar.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if ((b.word[i] &amp; 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +
+                  (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrar_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrar_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrar_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrar.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if ((b.dword[i] &amp; 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +
+                   (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrari_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrari.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; imm) + (((s8)a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrari_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrari.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] =
+        ((s16)a.half[i] &gt;&gt; imm) + (((s16)a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrari_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrari.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] =
+        ((s32)a.word[i] &gt;&gt; imm) + (((s32)a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrari_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrari.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] =
+        ((s64)a.dword[i] &gt;&gt; imm) + (((s64)a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarn.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_39">Description</h3>
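+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate to 8-bit and store the result to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>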
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u8 shift = (b.half[i] &amp; 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i] &gt;&gt; shift) +
+                         (((s16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u8 shift = (b.half[i - 8] &amp; 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] &gt;&gt; shift) +
+                         (((s16)a.half[i - 8] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarn.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_40">Description</h3>
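+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate to 16-bit and store the result to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>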
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u8 shift = (b.word[i] &amp; 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i] &gt;&gt; shift) +
+                          (((s32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u8 shift = (b.word[i - 4] &amp; 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] &gt;&gt; shift) +
+                          (((s32)a.word[i - 4] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarn.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_41">Description</h3>
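+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate to 32-bit and store the result to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>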
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u8 shift = (b.dword[i] &amp; 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i] &gt;&gt; shift) +
+                          (((s64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u8 shift = (b.dword[i - 2] &amp; 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] &gt;&gt; shift) +
+                          (((s64)a.dword[i - 2] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarni.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_42">Description</h3>
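+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>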
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (s8)(((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] &gt;&gt; imm) +
+                         (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)b.half[i - 8] &gt;&gt; imm) +
+                         (((s16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 16];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 16] &gt;&gt; imm) +
+                         (((s16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarni.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_43">Description</h3>
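+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>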
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i] &gt;&gt; imm) +
+                          (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] &gt;&gt; imm) +
+                          (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i - 4] &gt;&gt; imm) +
+                          (((s32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 8];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 8] &gt;&gt; imm) +
+                          (((s32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_44">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarni.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_44">Description</h3>
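+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>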
+<h3 id="operation_44">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i] &gt;&gt; imm) +
+                          (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] &gt;&gt; imm) +
+                          (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i - 2] &gt;&gt; imm) +
+                          (((s64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 4];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 4] &gt;&gt; imm) +
+                          (((s64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_44">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrarni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_45">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrarni.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_45">Description</h3>
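+<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the result from <code>b</code> and the upper half holds the result from <code>a</code>.</p>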
+<h3 id="operation_45">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i] &gt;&gt; imm) +
+                           (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 1] &gt;&gt; imm) +
+                           (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i - 1] &gt;&gt; imm) +
+                           (((s128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 2];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 2] &gt;&gt; imm) +
+                           (((s128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_45">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrl_b-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_46">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrl_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrl.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_46">Description</h3>
+<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_46">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_46">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrl_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_47">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrl_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrl.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_47">Description</h3>
+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_47">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] &gt;&gt; (b.half[i] &amp; 0xf);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_47">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrl_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_48">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrl_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrl.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_48">Description</h3>
+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_48">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_48">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrl_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrl_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_49">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrl_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrl.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_49">Description</h3>
+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_49">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_49">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrli_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_50">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrli.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_50">Description</h3>
+<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_50">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_50">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrli_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_51">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrli.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_51">Description</h3>
+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_51">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_51">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrli_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_52">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrli.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_52">Description</h3>
+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_52">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_52">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrli_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_53">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrli.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_53">Description</h3>
+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_53">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = a.dword[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_53">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrln_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_54">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrln.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_54">Description</h3>
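+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate to 8-bit and store the result to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>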
+<h3 id="operation_54">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? (u8)((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = (i &lt; 24) ? (u8)((u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_54">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrln_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_55">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrln.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_55">Description</h3>
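+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate to 16-bit and store the result to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>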
+<h3 id="operation_55">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? (u16)((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] =
+      (i &lt; 12) ? (u16)((u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_55">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrln_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_56">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrln.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_56">Description</h3>
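+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate to 32-bit and store the result to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>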
+<h3 id="operation_56">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (u32)((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] =
+      (i &lt; 6) ? (u32)((u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_56">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_57">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlni.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_57">Description</h3>
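+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>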
+<h3 id="operation_57">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] =
+      (i &lt; 8) ? (u8)((u16)b.half[i] &gt;&gt; imm) : (u8)((u16)a.half[i - 8] &gt;&gt; imm);
+}
+for (int i = 16; i &lt; 32; i++) {
+  dst.byte[i] = (i &lt; 24) ? (u8)((u16)b.half[i - 8] &gt;&gt; imm)
+                         : (u8)((u16)a.half[i - 16] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_57">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_58">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlni.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_58">Description</h3>
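+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>. Within each 128-bit lane, the results from <code>b</code> fill the lower half of <code>dst</code> and the results from <code>a</code> fill the upper half.</p>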
+<h3 id="operation_58">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (i &lt; 4) ? (u16)((u32)b.word[i] &gt;&gt; imm) : (u16)((u32)a.word[i - 4] &gt;&gt; imm);
+}
+for (int i = 8; i &lt; 16; i++) {
+  dst.half[i] = (i &lt; 12) ? (u16)((u32)b.word[i - 4] &gt;&gt; imm)
+                         : (u16)((u32)a.word[i - 8] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_58">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_59">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlni.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_59">Description</h3>
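+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>. Within each 128-bit lane, the results from <code>b</code> fill the lower half of <code>dst</code> and the results from <code>a</code> fill the upper half.</p>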
+<h3 id="operation_59">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (u32)((u64)b.dword[i] &gt;&gt; imm)
+                        : (u32)((u64)a.dword[i - 2] &gt;&gt; imm);
+}
+for (int i = 4; i &lt; 8; i++) {
+  dst.word[i] = (i &lt; 6) ? (u32)((u64)b.dword[i - 2] &gt;&gt; imm)
+                        : (u32)((u64)a.dword[i - 4] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_59">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_60">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlni.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_60">Description</h3>
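+<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>. Within each 128-bit lane, the result from <code>b</code> fills the lower half of <code>dst</code> and the result from <code>a</code> fills the upper half.</p>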
+<h3 id="operation_60">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? (u64)((u128)b.qword[i] &gt;&gt; imm)
+                         : (u64)((u128)a.qword[i - 1] &gt;&gt; imm);
+}
+for (int i = 2; i &lt; 4; i++) {
+  dst.dword[i] = (i &lt; 3) ? (u64)((u128)b.qword[i - 1] &gt;&gt; imm)
+                         : (u64)((u128)a.qword[i - 2] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_60">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlr_b-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_61">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlr.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_61">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_61">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  if ((b.byte[i] &amp; 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +
+                  ((a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_61">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlr_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_62">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlr.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_62">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_62">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if ((b.half[i] &amp; 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +
+                  ((a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_62">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlr_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_63">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlr.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_63">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_63">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if ((b.word[i] &amp; 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +
+                  ((a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_63">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlr_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_64">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlr.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_64">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_64">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if ((b.dword[i] &amp; 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +
+                   ((a.dword[i] &gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_64">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlri_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_65">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlri.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_65">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_65">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] &gt;&gt; imm) + ((a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_65">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlri_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_66">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlri.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_66">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_66">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] &gt;&gt; imm) + ((a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_66">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlri_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_67">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlri.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_67">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_67">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] &gt;&gt; imm) + ((a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_67">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlri_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_68">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlri.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_68">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_68">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] &gt;&gt; imm) + ((a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_68">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_69">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_69">Description</h3>
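+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>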
+<h3 id="operation_69">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u8 shift = (b.half[i] &amp; 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i] &gt;&gt; shift) +
+                         (((u16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u8 shift = (b.half[i - 8] &amp; 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] &gt;&gt; shift) +
+                         (((u16)a.half[i - 8] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_69">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_70">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_70">Description</h3>
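+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>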
+<h3 id="operation_70">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u8 shift = (b.word[i] &amp; 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i] &gt;&gt; shift) +
+                          (((u32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u8 shift = (b.word[i - 4] &amp; 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] &gt;&gt; shift) +
+                          (((u32)a.word[i - 4] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_70">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_71">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_71">Description</h3>
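+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>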
+<h3 id="operation_71">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u8 shift = (b.dword[i] &amp; 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i] &gt;&gt; shift) +
+                          (((u64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u8 shift = (b.dword[i - 2] &amp; 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] &gt;&gt; shift) +
+                          (((u64)a.dword[i - 2] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_71">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_72">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_72">Description</h3>
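+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>. Within each 128-bit lane, the results from <code>b</code> fill the lower half of <code>dst</code> and the results from <code>a</code> fill the upper half.</p>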
+<h3 id="operation_72">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (u8)(((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] &gt;&gt; imm) +
+                         (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)b.half[i - 8] &gt;&gt; imm) +
+                         (((u16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 16];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 16] &gt;&gt; imm) +
+                         (((u16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_72">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_73">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_73">Description</h3>
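+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>. Within each 128-bit lane, the results from <code>b</code> fill the lower half of <code>dst</code> and the results from <code>a</code> fill the upper half.</p>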
+<h3 id="operation_73">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i] &gt;&gt; imm) +
+                          (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] &gt;&gt; imm) +
+                          (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i - 4] &gt;&gt; imm) +
+                          (((u32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 8];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 8] &gt;&gt; imm) +
+                          (((u32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_73">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_74">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_74">Description</h3>
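+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>. Within each 128-bit lane, the results from <code>b</code> fill the lower half of <code>dst</code> and the results from <code>a</code> fill the upper half.</p>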
+<h3 id="operation_74">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i] &gt;&gt; imm) +
+                          (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] &gt;&gt; imm) +
+                          (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i - 2] &gt;&gt; imm) +
+                          (((u64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 4];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 4] &gt;&gt; imm) +
+                          (((u64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_74">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvsrlrni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_75">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvsrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_75">Description</h3>
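+<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>. Within each 128-bit lane, the result from <code>b</code> fills the lower half of <code>dst</code> and the result from <code>a</code> fills the upper half.</p>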
+<h3 id="operation_75">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i] &gt;&gt; imm) +
+                           (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 1] &gt;&gt; imm) +
+                           (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i - 1] &gt;&gt; imm) +
+                           (((u128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 2];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 2] &gt;&gt; imm) +
+                           (((u128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_75">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssran_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_76">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssran.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_76">Description</h3>
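+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>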
+<h3 id="operation_76">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp = (s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_76">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssran_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_77">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssran.bu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_77">Description</h3>
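+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>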
+<h3 id="operation_77">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp = (s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_77">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssran_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_78">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssran.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_78">Description</h3>
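+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>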
+<h3 id="operation_78">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp = (s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_78">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssran_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_79">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssran.hu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_79">Description</h3>
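+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>. The results fill the lower half of each 128-bit lane of <code>dst</code>; the upper half of each lane is set to zero.</p>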
+<h3 id="operation_79">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp = (s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_79">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssran_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_80">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssran.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_80">Description</h3>
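+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in a signed 32-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>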
+<h3 id="operation_80">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp = (s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_80">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssran_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_81">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssran.wu.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_81">Description</h3>
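+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in an unsigned 32-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>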
+<h3 id="operation_81">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp = (s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_81">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_82">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_82">Description</h3>
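+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>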
+<h3 id="operation_82">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp = (s16)b.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 16] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_82">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_83">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.bu.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_83">Description</h3>
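+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>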
+<h3 id="operation_83">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp = (s16)b.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 16] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_83">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_84">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_84">Description</h3>
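+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>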
+<h3 id="operation_84">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp = (s32)b.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 8] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_84">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_85">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.hu.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_85">Description</h3>
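+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>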
+<h3 id="operation_85">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp = (s32)b.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 8] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_85">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_86">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_86">Description</h3>
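+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>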
+<h3 id="operation_86">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp = (s64)b.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 4] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_86">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_87">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.wu.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_87">Description</h3>
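+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>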
+<h3 id="operation_87">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp = (s64)b.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 4] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_87">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_88">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_88">Description</h3>
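+<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the result from <code>b</code> and the upper half holds the result from <code>a</code>.</p>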
+<h3 id="operation_88">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp = (s128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    s128 temp = (s128)b.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 2] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_88">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrani_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_89">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrani.du.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_89">Description</h3>
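+<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 64-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the result from <code>b</code> and the upper half holds the result from <code>a</code>.</p>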
+<h3 id="operation_89">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp = (s128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    s128 temp = (s128)b.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 2] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_89">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_90">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarn.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_90">Description</h3>
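+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in a signed 8-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>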
+<h3 id="operation_90">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp;
+    if ((b.half[i - 8] &amp; 15) == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp = ((s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +
+             (((s16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_90">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarn_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_91">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarn.bu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_91">Description</h3>
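+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in an unsigned 8-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>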
+<h3 id="operation_91">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp;
+    if ((b.half[i - 8] &amp; 15) == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp = ((s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +
+             (((s16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_91">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_92">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarn.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_92">Description</h3>
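+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in a signed 16-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>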
+<h3 id="operation_92">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp;
+    if ((b.word[i - 4] &amp; 31) == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp = ((s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +
+             (((s32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_92">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarn_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_93">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarn.hu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_93">Description</h3>
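+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in an unsigned 16-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>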
+<h3 id="operation_93">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp;
+    if ((b.word[i - 4] &amp; 31) == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp = ((s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +
+             (((s32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_93">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_94">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarn.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_94">Description</h3>
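+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in a signed 32-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>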
+<h3 id="operation_94">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp;
+    if ((b.dword[i - 2] &amp; 63) == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +
+             (((s64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_94">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarn_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_95">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarn.wu.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_95">Description</h3>
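+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in an unsigned 32-bit integer, and store the results in the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>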
+<h3 id="operation_95">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp;
+    if ((b.dword[i - 2] &amp; 63) == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +
+             (((s64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_95">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_96">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_96">Description</h3>
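+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>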
+<h3 id="operation_96">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i - 8];
+    } else {
+      temp =
+          ((s16)b.half[i - 8] &gt;&gt; imm) + (((s16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 16];
+    } else {
+      temp = ((s16)a.half[i - 16] &gt;&gt; imm) +
+             (((s16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_96">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_97">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.bu.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_97">Description</h3>
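+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>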
+<h3 id="operation_97">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i - 8];
+    } else {
+      temp =
+          ((s16)b.half[i - 8] &gt;&gt; imm) + (((s16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 16];
+    } else {
+      temp = ((s16)a.half[i - 16] &gt;&gt; imm) +
+             (((s16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_97">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_98">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_98">Description</h3>
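+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer, and store the results to <code>dst</code>. Within each 128-bit lane, the lower half of <code>dst</code> holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>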
+<h3 id="operation_98">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i - 4];
+    } else {
+      temp =
+          ((s32)b.word[i - 4] &gt;&gt; imm) + (((s32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 8];
+    } else {
+      temp =
+          ((s32)a.word[i - 8] &gt;&gt; imm) + (((s32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_98">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_99">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.hu.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_99">Description</h3>
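+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>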
+<h3 id="operation_99">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i - 4];
+    } else {
+      temp =
+          ((s32)b.word[i - 4] &gt;&gt; imm) + (((s32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 8];
+    } else {
+      temp =
+          ((s32)a.word[i - 8] &gt;&gt; imm) + (((s32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_99">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_100">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_100">Description</h3>
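+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>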
+<h3 id="operation_100">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +
+             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i - 2];
+    } else {
+      temp = ((s64)b.dword[i - 2] &gt;&gt; imm) +
+             (((s64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 4];
+    } else {
+      temp = ((s64)a.dword[i - 4] &gt;&gt; imm) +
+             (((s64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_100">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_101">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.wu.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_101">Description</h3>
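+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>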
+<h3 id="operation_101">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +
+             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i - 2];
+    } else {
+      temp = ((s64)b.dword[i - 2] &gt;&gt; imm) +
+             (((s64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 4];
+    } else {
+      temp = ((s64)a.dword[i - 4] &gt;&gt; imm) +
+             (((s64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_101">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_102">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_102">Description</h3>
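+<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the result from <code>b</code> and the upper half holds the result from <code>a</code>.</p>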
+<h3 id="operation_102">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +
+             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i - 1];
+    } else {
+      temp = ((s128)b.qword[i - 1] &gt;&gt; imm) +
+             (((s128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 2];
+    } else {
+      temp = ((s128)a.qword[i - 2] &gt;&gt; imm) +
+             (((s128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_102">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrarni_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_103">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrarni.du.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_103">Description</h3>
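+<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 64-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the result from <code>b</code> and the upper half holds the result from <code>a</code>.</p>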
+<h3 id="operation_103">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +
+             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i - 1];
+    } else {
+      temp = ((s128)b.qword[i - 1] &gt;&gt; imm) +
+             (((s128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 2];
+    } else {
+      temp = ((s128)a.qword[i - 2] &gt;&gt; imm) +
+             (((s128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_103">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrln_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_104">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrln.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_104">Description</h3>
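+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in a signed 8-bit integer and store it to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>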
+<h3 id="operation_104">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp = (u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_104">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrln_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_105">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrln.bu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_105">Description</h3>
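+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in an unsigned 8-bit integer and store it to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>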
+<h3 id="operation_105">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp = (u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_105">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrln_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_106">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrln.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_106">Description</h3>
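+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in a signed 16-bit integer and store it to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>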
+<h3 id="operation_106">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp = (u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_106">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrln_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_107">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrln.hu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_107">Description</h3>
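+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in an unsigned 16-bit integer and store it to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>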
+<h3 id="operation_107">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp = (u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_107">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrln_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_108">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrln.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_108">Description</h3>
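+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in a signed 32-bit integer and store it to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>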
+<h3 id="operation_108">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp = (u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_108">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrln_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_109">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrln.wu.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_109">Description</h3>
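+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in an unsigned 32-bit integer and store it to the lower half of each 128-bit lane of <code>dst</code>. The upper half of each lane is set to zero.</p>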
+<h3 id="operation_109">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp = (u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_109">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_110">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_110">Description</h3>
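+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>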
+<h3 id="operation_110">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp = (u16)b.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 16] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_110">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_111">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.bu.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_111">Description</h3>
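+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>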
+<h3 id="operation_111">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp = (u16)b.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 16] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_111">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_112">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_112">Description</h3>
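+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>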
+<h3 id="operation_112">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp = (u32)b.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 8] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_112">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_113">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.hu.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_113">Description</h3>
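+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>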
+<h3 id="operation_113">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp = (u32)b.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 8] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_113">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_114">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_114">Description</h3>
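+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>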
+<h3 id="operation_114">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp = (u64)b.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 4] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_114">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_115">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.wu.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_115">Description</h3>
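+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the results from <code>b</code> and the upper half holds the results from <code>a</code>.</p>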
+<h3 id="operation_115">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp = (u64)b.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 4] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_115">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_116">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_116">Description</h3>
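+<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer and store it to <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower half holds the result from <code>b</code> and the upper half holds the result from <code>a</code>.</p>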
+<h3 id="operation_116">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp = (u128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    u128 temp = (u128)b.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 2] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_116">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlni_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_117">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlni.du.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_117">Description</h3>
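+<p>Logically right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in an unsigned 64-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the result from <code>b</code> and the upper 64 bits hold the result from <code>a</code>.</p>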
+<h3 id="operation_117">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp = (u128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    u128 temp = (u128)b.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 2] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_117">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrn_b_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_118">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrn.b.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_118">Description</h3>
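+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each are used as the shift amount), clamp the results to fit in a signed 8-bit integer, and store them in the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each lane are set to zero.</p>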
+<h3 id="operation_118">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp;
+    if ((b.half[i - 8] &amp; 15) == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp = ((u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +
+             (((u16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_118">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrn_bu_h-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_119">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrn.bu.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_119">Description</h3>
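+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each are used as the shift amount), clamp the results to fit in an unsigned 8-bit integer, and store them in the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each lane are set to zero.</p>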
+<h3 id="operation_119">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp;
+    if ((b.half[i - 8] &amp; 15) == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp = ((u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +
+             (((u16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_119">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrn_h_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_120">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrn.h.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_120">Description</h3>
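+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each are used as the shift amount), clamp the results to fit in a signed 16-bit integer, and store them in the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each lane are set to zero.</p>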
+<h3 id="operation_120">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp;
+    if ((b.word[i - 4] &amp; 31) == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp = ((u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +
+             (((u32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_120">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrn_hu_w-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_121">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrn.hu.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_121">Description</h3>
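+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each are used as the shift amount), clamp the results to fit in an unsigned 16-bit integer, and store them in the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each lane are set to zero.</p>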
+<h3 id="operation_121">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp;
+    if ((b.word[i - 4] &amp; 31) == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp = ((u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +
+             (((u32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_121">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrn_w_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_122">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrn.w.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_122">Description</h3>
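+<p>Logically right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each are used as the shift amount), clamp the results to fit in a signed 32-bit integer, and store them in the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each lane are set to zero.</p>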
+<h3 id="operation_122">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp;
+    if ((b.dword[i - 2] &amp; 63) == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +
+             (((u64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_122">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrn_wu_d-__m256i-a-__m256i-b">__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_123">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrn.wu.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_123">Description</h3>
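+<p>Logically right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each are used as the shift amount), clamp the results to fit in an unsigned 32-bit integer, and store them in the lower 64 bits of each 128-bit lane of <code>dst</code>. The upper 64 bits of each lane are set to zero.</p>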
+<h3 id="operation_123">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp;
+    if ((b.dword[i - 2] &amp; 63) == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +
+             (((u64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_123">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_b_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_124">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.b.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_124">Description</h3>
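+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in a signed 8-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>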
+<h3 id="operation_124">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i - 8];
+    } else {
+      temp =
+          ((u16)b.half[i - 8] &gt;&gt; imm) + (((u16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 16];
+    } else {
+      temp = ((u16)a.half[i - 16] &gt;&gt; imm) +
+             (((u16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_124">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_bu_h-__m256i-a-__m256i-b-imm0_15-imm">__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)</h2>
+<h3 id="synopsis_125">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.bu.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_125">Description</h3>
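+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in an unsigned 8-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>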
+<h3 id="operation_125">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  }
+}
+for (int i = 16; i &lt; 32; i++) {
+  if (i &lt; 24) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i - 8];
+    } else {
+      temp =
+          ((u16)b.half[i - 8] &gt;&gt; imm) + (((u16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 16];
+    } else {
+      temp = ((u16)a.half[i - 16] &gt;&gt; imm) +
+             (((u16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_125">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_h_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_126">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.h.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_126">Description</h3>
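+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in a signed 16-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>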
+<h3 id="operation_126">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i - 4];
+    } else {
+      temp =
+          ((u32)b.word[i - 4] &gt;&gt; imm) + (((u32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 8];
+    } else {
+      temp =
+          ((u32)a.word[i - 8] &gt;&gt; imm) + (((u32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_126">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_hu_w-__m256i-a-__m256i-b-imm0_31-imm">__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)</h2>
+<h3 id="synopsis_127">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.hu.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_127">Description</h3>
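+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in an unsigned 16-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>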
+<h3 id="operation_127">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  }
+}
+for (int i = 8; i &lt; 16; i++) {
+  if (i &lt; 12) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i - 4];
+    } else {
+      temp =
+          ((u32)b.word[i - 4] &gt;&gt; imm) + (((u32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 8];
+    } else {
+      temp =
+          ((u32)a.word[i - 8] &gt;&gt; imm) + (((u32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_127">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_w_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_128">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.w.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_128">Description</h3>
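+<p>Logically right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in a signed 32-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>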
+<h3 id="operation_128">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +
+             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i - 2];
+    } else {
+      temp = ((u64)b.dword[i - 2] &gt;&gt; imm) +
+             (((u64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 4];
+    } else {
+      temp = ((u64)a.dword[i - 4] &gt;&gt; imm) +
+             (((u64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_128">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_wu_d-__m256i-a-__m256i-b-imm0_63-imm">__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)</h2>
+<h3 id="synopsis_129">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.wu.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_129">Description</h3>
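+<p>Logically right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in an unsigned 32-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the results from <code>b</code> and the upper 64 bits hold the results from <code>a</code>.</p>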
+<h3 id="operation_129">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +
+             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  }
+}
+for (int i = 4; i &lt; 8; i++) {
+  if (i &lt; 6) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i - 2];
+    } else {
+      temp = ((u64)b.dword[i - 2] &gt;&gt; imm) +
+             (((u64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 4];
+    } else {
+      temp = ((u64)a.dword[i - 4] &gt;&gt; imm) +
+             (((u64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_129">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_d_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_130">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.d.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_130">Description</h3>
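+<p>Logically right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in a signed 64-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the result from <code>b</code> and the upper 64 bits hold the result from <code>a</code>.</p>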
+<h3 id="operation_130">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +
+             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i - 1];
+    } else {
+      temp = ((u128)b.qword[i - 1] &gt;&gt; imm) +
+             (((u128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 2];
+    } else {
+      temp = ((u128)a.qword[i - 2] &gt;&gt; imm) +
+             (((u128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_130">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvssrlrni_du_q-__m256i-a-__m256i-b-imm0_127-imm">__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)</h2>
+<h3 id="synopsis_131">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvssrlrni.du.q xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_131">Description</h3>
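+<p>Logically right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp the results to fit in an unsigned 64-bit integer, and store them in <code>dst</code>. In each 128-bit lane of <code>dst</code>, the lower 64 bits hold the result from <code>b</code> and the upper 64 bits hold the result from <code>a</code>.</p>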
+<h3 id="operation_131">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +
+             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+for (int i = 2; i &lt; 4; i++) {
+  if (i &lt; 3) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i - 1];
+    } else {
+      temp = ((u128)b.qword[i - 1] &gt;&gt; imm) +
+             (((u128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 2];
+    } else {
+      temp = ((u128)a.qword[i - 2] &gt;&gt; imm) +
+             (((u128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_131">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotr_b-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_b (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_132">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotr_b (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotr.b xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_132">Description</h3>
+<p>Rotate the unsigned 8-bit elements in <code>a</code> right by the corresponding elements in <code>b</code> (only the low 3 bits of each are used as the rotate amount), and store the result in <code>dst</code>.</p>
+<h3 id="operation_132">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] =
+      (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) | (a.byte[i] &lt;&lt; (8 - (b.byte[i] &amp; 0x7)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_132">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotr_h-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_h (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_133">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotr_h (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotr.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_133">Description</h3>
+<p>Rotate right the unsigned 16-bit elements in <code>a</code> by the amounts given in the low 4 bits of the corresponding elements in <code>b</code>, and store the result to <code>dst</code>.</p>
+<h3 id="operation_133">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) |
+                (a.half[i] &lt;&lt; (16 - (b.half[i] &amp; 0xf)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_133">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotr_w-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_w (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_134">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotr_w (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotr.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_134">Description</h3>
+<p>Rotate right the unsigned 32-bit elements in <code>a</code> by the amounts given in the low 5 bits of the corresponding elements in <code>b</code>, and store the result to <code>dst</code>.</p>
+<h3 id="operation_134">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) |
+                (a.word[i] &lt;&lt; (32 - (b.word[i] &amp; 0x1f)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_134">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotr_d-__m256i-a-__m256i-b">__m256i __lasx_xvrotr_d (__m256i a, __m256i b)</h2>
+<h3 id="synopsis_135">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotr_d (__m256i a, __m256i b)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotr.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_135">Description</h3>
+<p>Rotate right the unsigned 64-bit elements in <code>a</code> by the amounts given in the low 6 bits of the corresponding elements in <code>b</code>, and store the result to <code>dst</code>.</p>
+<h3 id="operation_135">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) |
+                 (a.dword[i] &lt;&lt; (64 - (b.dword[i] &amp; 0x3f)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_135">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotri_b-__m256i-a-imm0_7-imm">__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)</h2>
+<h3 id="synopsis_136">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotri.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_136">Description</h3>
+<p>Rotate right the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_136">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = (a.byte[i] &gt;&gt; imm) | (a.byte[i] &lt;&lt; (8 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_136">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotri_h-__m256i-a-imm0_15-imm">__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)</h2>
+<h3 id="synopsis_137">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotri.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_137">Description</h3>
+<p>Rotate right the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_137">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = (a.half[i] &gt;&gt; imm) | (a.half[i] &lt;&lt; (16 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_137">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotri_w-__m256i-a-imm0_31-imm">__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)</h2>
+<h3 id="synopsis_138">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotri.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_138">Description</h3>
+<p>Rotate right the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_138">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = (a.word[i] &gt;&gt; imm) | (a.word[i] &lt;&lt; (32 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_138">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvrotri_d-__m256i-a-imm0_63-imm">__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)</h2>
+<h3 id="synopsis_139">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvrotri.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_139">Description</h3>
+<p>Rotate right the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_139">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.dword[i] = (a.dword[i] &gt;&gt; imm) | (a.dword[i] &lt;&lt; (64 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_139">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../permutation/" class="btn btn-neutral float-left" title="Permutation"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../shuffling/" class="btn btn-neutral float-right" title="Shuffling">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../permutation/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../shuffling/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lasx/shuffling/index.html b/lasx/shuffling/index.html
new file mode 100644
index 00000000..8c527ddf
--- /dev/null
+++ b/lasx/shuffling/index.html
@@ -0,0 +1,679 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shuffling/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Shuffling - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Shuffling";
+        var mkdocs_page_input_path = "lasx/shuffling.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lasx/shuffling/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Shuffling</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_1">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_2">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_3">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf4i_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_4">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf4i_h-__m256i-a-imm0_255-imm">__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_5">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf4i_w-__m256i-a-imm0_255-imm">__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_6">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m256i-__lasx_xvshuf4i_d-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_7">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lasx</li>
+      <li class="breadcrumb-item active">Shuffling</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="shuffling">Shuffling</h1>
+<h2 id="__m256i-__lasx_xvshuf_b-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf.b xr, xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Shuffle bytes from <code>a</code> and <code>b</code> with indices from <code>c</code>, save the result to <code>dst</code>.</p>
+<p>Caveat: the indices are placed in <code>c</code>, while in other <code>xvshuf</code> intrinsics, they are placed in <code>a</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf_b.svg" /></p>
+<h3 id="examples">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0x1f1f00001a0a1b0b, 0x1111120213031404, 0x0102030405060708, 0x1112131405060708})
+= 0x99997878ee21dd43 0x7777661555144413 0x4321433412341278 0x1234121212341278
+</code></pre>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  if ((c.byte[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.byte[i] = 0;
+  } else if ((c.byte[i] % 32) &lt; 16) {
+    dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i &gt;= 16) ? 16 : 0)];
+  } else {
+    dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i &gt;= 16) ? 0 : -16)];
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf_h-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf.h xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Shuffle 16-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf_h.svg" /></p>
+<h3 id="examples_1">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_h(__m256i{0x0001000200030004, 0x0005000a000b000c, 0x000f000e00010002, 0x0008000900020001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x1415ef13abcd4321 0x432133441122ff00 0xaabbaabb43211234 0x1234123412344321
+</code></pre>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if ((a.half[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.half[i] = 0;
+  } else if ((a.half[i] % 16) &lt; 8) {
+    dst.half[i] = c.half[(a.half[i] % 16) + ((i &gt;= 8) ? 8 : 0)];
+  } else {
+    dst.half[i] = b.half[(a.half[i] % 16) + ((i &gt;= 8) ? 0 : -8)];
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf_w-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf.w xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Shuffle 32-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf_w.svg" /></p>
+<h3 id="examples_2">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_w(__m256i{0x0000000200000004, 0x0000000700000005, 0x0000000100000003, 0x0000000400000000}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0x4321432155667788 0x99aabbcc11223344 0x1234123456785678 0x1234123443214321
+</code></pre>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if ((a.word[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.word[i] = 0;
+  } else if ((a.word[i] % 8) &lt; 4) {
+    dst.word[i] = c.word[(a.word[i] % 8) + ((i &gt;= 4) ? 4 : 0)];
+  } else {
+    dst.word[i] = b.word[(a.word[i] % 8) + ((i &gt;= 4) ? 0 : -4)];
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf_d-__m256i-a-__m256i-b-__m256i-c">__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf.d xr, xr, xr
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Shuffle 64-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf_d.svg" /></p>
+<h3 id="examples_3">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf_d(__m256i{0x0000000000000000, 0x0000000000000003, 0x0000000000000002, 0x0000000000000001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})
+= 0xabcdef1314156678 0x99aabbccddeeff00 0xabcdef1212341234 0x5678567856785678
+</code></pre>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if ((a.dword[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.dword[i] = 0;
+  } else if ((a.dword[i] % 4) &lt; 2) {
+    dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i &gt;= 2) ? 2 : 0)];
+  } else {
+    dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i &gt;= 2) ? 0 : -2)];
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf4i_b-__m256i-a-imm0_255-imm">__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf4i.b xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Shuffle every four 8-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf4i_b.svg" /></p>
+<h3 id="examples_4">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_b( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x13ef13cd78667815 0x3412343421432121 0x3412343421432121 0x7856787878567878
+</code></pre>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 32; i++) {
+  dst.byte[i] = a.byte[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf4i_h-__m256i-a-imm0_255-imm">__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf4i.h xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Shuffle every four 16-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf4i_h.svg" /></p>
+<h3 id="examples_5">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_h( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x667814156678ef13 0x4321432143211234 0x4321432143211234 0x5678567856785678
+</code></pre>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.half[i] = a.half[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf4i_w-__m256i-a-imm0_255-imm">__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf4i.w xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Shuffle every four 32-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf4i_w.svg" /></p>
+<h3 id="examples_6">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_w( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0x1415667843214321 0x14156678abcdef13 0x4321432156785678 0x4321432112341234
+</code></pre>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.word[i] = a.word[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m256i-__lasx_xvshuf4i_d-__m256i-a-__m256i-b-imm0_255-imm">__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)
+#include &lt;lasxintrin.h&gt;
+Instruction: xvshuf4i.d xr, xr, imm
+CPU Flags: LASX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Shuffle every four 64-bit elements in <code>a</code> and <code>b</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/xvshuf4i_d.svg" /></p>
+<h3 id="examples_7">Examples</h3>
+<pre><code class="language-c++">__m256i __lasx_xvshuf4i_d( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)
+= 0xabcdef1314156678 0x1122334455667788 0x1234123443214321 0xabcdef1212341234
+</code></pre>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">dst.dword[0] = (imm &amp; 2) ? b.dword[(imm &amp; 1)] : a.dword[(imm &amp; 1)];
+dst.dword[1] =
+    (imm &amp; 8) ? b.dword[((imm &gt;&gt; 2) &amp; 1)] : a.dword[((imm &gt;&gt; 2) &amp; 1)];
+dst.dword[2] = (imm &amp; 2) ? b.dword[(imm &amp; 1) + 2] : a.dword[(imm &amp; 1) + 2];
+dst.dword[3] =
+    (imm &amp; 8) ? b.dword[((imm &gt;&gt; 2) &amp; 1) + 2] : a.dword[((imm &gt;&gt; 2) &amp; 1) + 2];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../shift/" class="btn btn-neutral float-left" title="Shift"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../../lsx/bitwise_operations/" class="btn btn-neutral float-right" title="Bitwise Operations">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../shift/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../../lsx/bitwise_operations/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/latency_throughput/index.html b/latency_throughput/index.html
new file mode 100644
index 00000000..5b048e7d
--- /dev/null
+++ b/latency_throughput/index.html
@@ -0,0 +1,200 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/latency_throughput/" />
+      <link rel="shortcut icon" href="../img/favicon.ico" />
+    <title>Latency and Throughput of Instructions - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../css/theme.css" />
+    <link rel="stylesheet" href="../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Latency and Throughput of Instructions";
+        var mkdocs_page_input_path = "latency_throughput.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/latency_throughput/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href=".." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul class="current">
+                <li class="toctree-l1 current"><a class="reference internal current" href="./">Latency and Throughput of Instructions</a>
+    <ul class="current">
+    </ul>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href=".." class="icon icon-home" aria-label="Docs"></a></li>
+      <li class="breadcrumb-item active">Latency and Throughput of Instructions</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="latency-and-throughput-of-instructions">Latency and Throughput of Instructions</h1>
+<p>Latency and throughput (IPC, instructions per cycle) of each instruction:</p>
+<table><thead><tr><th rowspan=2>Instruction</th><th colspan=2>3A6000</th><th colspan=2>3C5000</th></tr><tr><th>Latency</th><th>Throughput (IPC)</th><th>Latency</th><th>Throughput (IPC)</th></tr></thead><tbody><tr><td>vabsd.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.du</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vabsd.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vadd.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vadd.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vadd.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vadd.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vadd.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vadda.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vadda.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vadda.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vadda.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddi.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vaddi.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vaddi.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vaddi.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vaddwev.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.d.wu.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.h.bu.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr>
<tr><td>vaddwev.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddwev.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddwev.q.du.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddwev.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwev.w.hu.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.d.wu.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.h.bu.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddwod.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddwod.q.du.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vaddwod.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vaddwod.w.hu.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vand.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vandi.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vandn.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavg.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavg.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavg.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vavg.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vavg.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavg.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavg.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavg.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavgr.b</td><td>
1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavgr.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavgr.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vavgr.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vavgr.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavgr.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavgr.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vavgr.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vbitclr.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclr.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclr.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclr.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclri.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclri.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclri.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitclri.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrev.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrev.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrev.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrev.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrevi.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrevi.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrevi.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitrevi.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitsel.v</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vbitseli.b</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vbitset.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitset.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitset.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitset.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitseti.
b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitseti.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitseti.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbitseti.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vbsll.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vbsrl.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vclo.b</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclo.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclo.h</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclo.w</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclz.b</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclz.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclz.h</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vclz.w</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vdiv.b</td><td>29, 32</td><td>0.06(1/15.5)</td><td>29, 32</td><td>0.06(1/17)</td></tr><tr><td>vdiv.bu</td><td>29, 33</td><td>0.06(1/16.5)</td><td>29, 36</td><td>0.06(1/18)</td></tr><tr><td>vdiv.d</td><td>8</td><td>0.25(1/4)</td><td>8, 18.5</td><td>0.11(1/9)</td></tr><tr><td>vdiv.du</td><td>8</td><td>0.25(1/4)</td><td>8, 18.5</td><td>0.11(1/9)</td></tr><tr><td>vdiv.h</td><td>17</td><td>0.12(1/8.5)</td><td>17, 21.5</td><td>0.09(1/11)</td></tr><tr><td>vdiv.hu</td><td>17, 22</td><td>0.11(1/9)</td><td>17, 21.5</td><td>0.07(1/14)</td></tr><tr><td>vdiv.w</td><td>11</td><td>0.18(1/5.5)</td><td>11, 17.5</td><td>0.09(1/11.5)</td></tr><tr><td>vdiv.wu</td><td>11</td><td>0.18(1/5.5)</td><td>11, 
17.5</td><td>0.07(1/15)</td></tr><tr><td>vext2xv.d.b</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.d.h</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.d.w</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.du.bu</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.du.hu</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.du.wu</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.h.b</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.hu.bu</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.w.b</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.w.h</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.wu.bu</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vext2xv.wu.hu</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>vexth.d.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.du.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.h.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.hu.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.q.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.qu.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.w.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vexth.wu.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vextl.q.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vextl.qu.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vextrins.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vextrins.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vextrins.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vextrins.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vfadd.d</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>vfadd.s</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>vfclass.d</td><td>2</td><td>4</td><td
>2</td><td>2</td></tr><tr><td>vfclass.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.caf.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.caf.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.ceq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.ceq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cle.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cle.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.clt.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.clt.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cne.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cne.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cor.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cor.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cueq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cueq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cule.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cule.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cult.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cult.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cun.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cun.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cune.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.cune.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.saf.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.saf.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.seq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.seq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sle.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sle.s</td><td>2</td><td>4<
/td><td>2</td><td>2</td></tr><tr><td>vfcmp.slt.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.slt.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sne.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sne.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sor.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sor.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sueq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sueq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sule.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sule.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sult.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sult.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sun.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sun.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sune.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcmp.sune.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfcvt.h.s</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>vfcvt.s.d</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>vfcvth.d.s</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>vfcvth.s.h</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>vfcvtl.d.s</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>vfcvtl.s.h</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>vfdiv.d</td><td>8, 21.5</td><td>0.25(1/4)</td><td>8, 16.5</td><td>0.08(1/12.5)</td></tr><tr><td>vfdiv.s</td><td>11</td><td>0.18(1/5.5)</td><td>11, 
19.5</td><td>0.13(1/7.5)</td></tr><tr><td>vffint.d.l</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vffint.d.lu</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vffint.s.l</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vffint.s.w</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vffint.s.wu</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vffinth.d.w</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vffintl.d.w</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vflogb.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vflogb.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vfmadd.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfmadd.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfmax.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmax.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmaxa.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmaxa.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmin.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmin.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmina.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmina.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vfmsub.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfmsub.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfmul.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfmul.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfnmadd.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfnmadd.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfnmsub.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfnmsub.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>vfrecip.d</td><td>8</td><td>0.25(1/4)</td><td>23</td><td>0.08(1/12)</td></tr><tr><td>vfrecip.s</td><td>11</td><td>0.18(1/5.5)</td><td>27</td><td>0.14(1/7)</td><
/tr><tr><td>vfrint.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrint.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrm.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrm.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrne.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrne.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrp.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrp.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrz.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrintrz.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vfrsqrt.d</td><td>15</td><td>0.04(1/26.5)</td><td>15</td><td>0.04(1/27.5)</td></tr><tr><td>vfrsqrt.s</td><td>17</td><td>0.05(1/19)</td><td>21</td><td>0.11(1/9)</td></tr><tr><td>vfrstp.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vfrstp.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vfrstpi.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vfrstpi.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vfsqrt.d</td><td>36</td><td>0.06(1/17.5)</td><td>36</td><td>0.05(1/18.5)</td></tr><tr><td>vfsqrt.s</td><td>11</td><td>0.08(1/12)</td><td>27</td><td>0.17(1/6)</td></tr><tr><td>vfsub.d</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>vfsub.s</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>vftint.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftint.lu.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftint.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftint.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftint.wu.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftinth.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintl.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrm.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrm.w.d
</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrm.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrmh.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrml.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrne.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrne.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrne.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrneh.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrnel.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrp.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrp.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrp.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrph.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrpl.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrz.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrz.lu.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrz.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrz.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrz.wu.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>vftintrzh.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vftintrzl.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>vhaddw.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhaddw.du.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhaddw.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhaddw.hu.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhaddw.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vhaddw.qu.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vhaddw.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhaddw.wu.hu</td><td>2</td><td>2</td><td>2</t
d><td>2</td></tr><tr><td>vhsubw.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhsubw.du.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhsubw.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhsubw.hu.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhsubw.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vhsubw.qu.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vhsubw.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vhsubw.wu.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vilvh.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvh.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvh.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvh.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvl.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvl.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvl.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vilvl.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vinsgr2vr.b</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vinsgr2vr.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vinsgr2vr.h</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vinsgr2vr.w</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vmadd.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmadd.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmadd.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmadd.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.h.b
u.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.q.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>vmaddwev.q.du</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>vmaddwev.q.du.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>vmaddwev.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwev.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.h.bu.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.q.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>vmaddwod.q.du</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>vmaddwod.q.du.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>vmaddwod.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmaddwod.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmax.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmax.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmax.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmax.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmax.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmax.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmax.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmax.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmaxi.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmaxi.bu</td><td>1</td><td>4</td><td>1
</td><td>2</td></tr><tr><td>vmaxi.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmaxi.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmaxi.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmaxi.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmaxi.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmaxi.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmin.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmin.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmin.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmin.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmin.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmin.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmin.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmin.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmini.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmini.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmini.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmini.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vmini.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmini.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmini.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmini.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmod.b</td><td>29, 35</td><td>0.06(1/15.5)</td><td>29, 33</td><td>0.06(1/17)</td></tr><tr><td>vmod.bu</td><td>29, 37</td><td>0.06(1/17.5)</td><td>29, 33</td><td>0.05(1/19)</td></tr><tr><td>vmod.d</td><td>8, 10</td><td>0.25(1/4)</td><td>8, 10</td><td>0.11(1/9.5)</td></tr><tr><td>vmod.du</td><td>8, 10</td><td>0.25(1/4)</td><td>8, 10</td><td>0.11(1/9.5)</td></tr><tr><td>vmod.h</td><td>17, 21</td><td>0.12(1/8.5)</td><td>17, 21</td><td>0.09(1/11)</td></tr><tr><td>vmod.hu</td><td>17, 21</td><td>0.11(1/9.5)</td><td>17, 
21</td><td>0.07(1/15)</td></tr><tr><td>vmod.w</td><td>11, 13</td><td>0.18(1/5.5)</td><td>11, 15</td><td>0.08(1/12)</td></tr><tr><td>vmod.wu</td><td>11, 13</td><td>0.18(1/5.5)</td><td>11, 15</td><td>0.06(1/16)</td></tr><tr><td>vmskgez.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmskltz.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmskltz.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmskltz.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmskltz.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmsknz.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vmsub.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmsub.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmsub.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmsub.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.du</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmuh.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmul.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmul.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmul.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmul.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr
><tr><td>vmulwev.h.bu.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.q.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>vmulwev.q.du</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>vmulwev.q.du.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>vmulwev.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwev.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.h.bu.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.q.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>vmulwod.q.du</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>vmulwod.q.du.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>vmulwod.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vmulwod.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>vneg.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vneg.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vneg.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vneg.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vnor.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vnori.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vor.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vori.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vorn.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackev.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackev.d<
/td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackev.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackev.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackod.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackod.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackod.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpackod.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpcnt.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vpcnt.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vpcnt.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vpcnt.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vpermi.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickev.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickev.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickev.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickev.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickod.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickod.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickod.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickod.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vpickve2gr.b</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.bu</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.du</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.h</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.hu</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.w</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vpickve2gr.wu</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vreplgr2vr.b</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>vreplgr2vr.d</td><td>N/A</td><td>1</td><t
d>N/A</td><td>1</td></tr><tr><td>vreplgr2vr.h</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>vreplgr2vr.w</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>vrepli.b</td><td>N/A</td><td>6</td><td>N/A</td><td>2</td></tr><tr><td>vrepli.d</td><td>N/A</td><td>4</td><td>N/A</td><td>2</td></tr><tr><td>vrepli.h</td><td>N/A</td><td>4</td><td>N/A</td><td>2</td></tr><tr><td>vrepli.w</td><td>N/A</td><td>4</td><td>N/A</td><td>2</td></tr><tr><td>vreplve.b</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vreplve.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vreplve.h</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vreplve.w</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>vreplvei.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vreplvei.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vreplvei.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vreplvei.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vrotr.b</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotr.d</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotr.h</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotr.w</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotri.b</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotri.d</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotri.h</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vrotri.w</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vsadd.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsadd.wu</td><td>
1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsat.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.du</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsat.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vseq.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseq.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseq.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseq.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseqi.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseqi.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseqi.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vseqi.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsetallnez.b</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetallnez.d</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetallnez.h</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetallnez.w</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetanyeqz.b</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetanyeqz.d</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetanyeqz.h</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetanyeqz.w</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vseteqz.v</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vsetnez.v</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>vshuf4i.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vshuf4i.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vshuf4i.h</td><td>1</td><td>4</td><td>1</td><td>2</
td></tr><tr><td>vshuf4i.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vshuf.b</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vshuf.d</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vshuf.h</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vshuf.w</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vsigncov.b</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vsigncov.d</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vsigncov.h</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vsigncov.w</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>vsle.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsle.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsle.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vsle.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vsle.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsle.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsle.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsle.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslei.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslei.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslei.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vslei.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vslei.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslei.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslei.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslei.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsll.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsll.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsll.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsll.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslli.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslli.d</td><td>1</td><t
d>4</td><td>1</td><td>2</td></tr><tr><td>vslli.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslli.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsllwil.d.w</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsllwil.du.wu</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsllwil.h.b</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsllwil.hu.bu</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsllwil.w.h</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsllwil.wu.hu</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vslt.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslt.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslt.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vslt.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vslt.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslt.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslt.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslt.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslti.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslti.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslti.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vslti.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>vslti.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslti.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslti.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vslti.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsra.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsra.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsra.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsra.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrai.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrai.d</td><td>1</td><td>4</td><td>1</td><td>2<
/td></tr><tr><td>vsrai.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrai.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsran.b.h</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsran.h.w</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsran.w.d</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsrani.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrani.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrani.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrani.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrar.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrar.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrar.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrar.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrari.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrari.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrari.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrari.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrarn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrarn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrarn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrarni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrarni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrarni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrarni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrl.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrl.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrl.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrl.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrli.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrli.d</td><td>1</td><td>4</td><td>1</td><td>2</td></t
r><tr><td>vsrli.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrli.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsrln.b.h</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsrln.h.w</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsrln.w.d</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>vsrlni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlr.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlr.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlr.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlr.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlri.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlri.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlri.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlri.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlrn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlrn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlrn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlrni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlrni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsrlrni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vsrlrni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssran.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssran.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssran.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssran.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssran.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssran.wu.d</td><td>4</td><td>2</td><td>4
</td><td>1</td></tr><tr><td>vssrani.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrani.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrani.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrani.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrani.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrani.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrani.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrani.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarn.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarn.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarn.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarni.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrarni.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrarni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarni.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrarni.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrln.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrln.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrln.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrln.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrln.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrln.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlni.b.h</td><td>4</t
d><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlni.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrlni.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrlni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlni.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlni.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrn.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrn.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrn.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrni.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrlrni.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vssrlrni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrni.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssrlrni.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>vssub.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vssub.wu</td><td>1</td><td>4<
/td><td>1</td><td>2</td></tr><tr><td>vsub.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsub.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsub.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsub.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsub.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsubi.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsubi.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsubi.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsubi.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vsubwev.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwev.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwev.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwev.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwev.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsubwev.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsubwev.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwev.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwod.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwod.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwod.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwod.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwod.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsubwod.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>vsubwod.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vsubwod.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>vxor.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>vxori.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvabsd.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabsd.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabs
d.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabsd.du</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabsd.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabsd.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabsd.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvabsd.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvadd.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvadd.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvadd.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvadd.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvadd.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvadda.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvadda.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvadda.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvadda.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddi.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvaddi.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvaddi.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvaddi.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvaddwev.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.d.wu.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.h.bu.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddwev.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddwev.q.du.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddwev.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwev.w.hu</td><td>2</td><td>2</td
><td>2</td><td>2</td></tr><tr><td>xvaddwev.w.hu.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.d.wu.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.h.bu.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddwod.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddwod.q.du.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvaddwod.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvaddwod.w.hu.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvand.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvandi.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvandn.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavg.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavg.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavg.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvavg.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvavg.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavg.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavg.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavg.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavgr.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavgr.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavgr.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvavgr.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvavgr.h</td><td>1</td><td>4</td><td>1</td><td>2</
td></tr><tr><td>xvavgr.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavgr.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvavgr.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvbitclr.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclr.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclr.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclr.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclri.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclri.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclri.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitclri.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrev.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrev.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrev.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrev.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrevi.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrevi.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrevi.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitrevi.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitsel.v</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvbitseli.b</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvbitset.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitset.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitset.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitset.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitseti.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitseti.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitseti.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvbitseti.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvb
sll.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvbsrl.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvclo.b</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclo.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclo.h</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclo.w</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclz.b</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclz.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclz.h</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvclz.w</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvdiv.b</td><td>29, 32</td><td>0.06(1/15.5)</td><td>32, 36</td><td>0.05(1/20.5)</td></tr><tr><td>xvdiv.bu</td><td>29, 33</td><td>0.06(1/16.5)</td><td>29, 36</td><td>0.05(1/20.5)</td></tr><tr><td>xvdiv.d</td><td>8</td><td>0.25(1/4)</td><td>8, 18.5</td><td>0.11(1/9)</td></tr><tr><td>xvdiv.du</td><td>8</td><td>0.25(1/4)</td><td>8, 18.5</td><td>0.11(1/9)</td></tr><tr><td>xvdiv.h</td><td>17</td><td>0.12(1/8.5)</td><td>21.5, 22</td><td>0.08(1/13)</td></tr><tr><td>xvdiv.hu</td><td>17, 22</td><td>0.11(1/9)</td><td>17, 21.5</td><td>0.07(1/15)</td></tr><tr><td>xvdiv.w</td><td>11</td><td>0.18(1/5.5)</td><td>11, 17.5</td><td>0.09(1/11.5)</td></tr><tr><td>xvdiv.wu</td><td>11</td><td>0.18(1/5.5)</td><td>11, 
17.5</td><td>0.07(1/15)</td></tr><tr><td>xvexth.d.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.du.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.h.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.hu.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.q.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.qu.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.w.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvexth.wu.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvextl.q.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvextl.qu.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvextrins.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvextrins.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvextrins.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvextrins.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvfadd.d</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>xvfadd.s</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>xvfclass.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfclass.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.caf.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.caf.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.ceq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.ceq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cle.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cle.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.clt.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.clt.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cne.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cne.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cor.d</td><td>2</t
d><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cor.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cueq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cueq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cule.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cule.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cult.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cult.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cun.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cun.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cune.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.cune.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.saf.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.saf.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.seq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.seq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sle.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sle.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.slt.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.slt.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sne.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sne.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sor.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sor.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sueq.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sueq.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sule.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sule.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sult.d</td><td>2</td><td>4</td><td>2</td><td>2
</td></tr><tr><td>xvfcmp.sult.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sun.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sun.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sune.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcmp.sune.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfcvt.h.s</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>xvfcvt.s.d</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>xvfcvth.d.s</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>xvfcvth.s.h</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>xvfcvtl.d.s</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>xvfcvtl.s.h</td><td>3</td><td>2</td><td>3</td><td>1</td></tr><tr><td>xvfdiv.d</td><td>8, 21.5</td><td>0.25(1/4)</td><td>8, 17</td><td>0.08(1/12.5)</td></tr><tr><td>xvfdiv.s</td><td>11</td><td>0.18(1/5.5)</td><td>11, 19.5</td><td>0.1(1/10.5)</td></tr><tr><td>xvffint.d.l</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvffint.d.lu</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvffint.s.l</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvffint.s.w</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvffint.s.wu</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvffinth.d.w</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvffintl.d.w</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvflogb.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvflogb.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvfmadd.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfmadd.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfmax.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmax.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmaxa.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmaxa.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmin.
d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmin.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmina.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmina.s</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvfmsub.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfmsub.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfmul.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfmul.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfnmadd.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfnmadd.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfnmsub.d</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfnmsub.s</td><td>5</td><td>2</td><td>5</td><td>2</td></tr><tr><td>xvfrecip.d</td><td>23</td><td>0.25(1/4)</td><td>23</td><td>0.08(1/12)</td></tr><tr><td>xvfrecip.s</td><td>27</td><td>0.18(1/5.5)</td><td>27</td><td>0.14(1/7)</td></tr><tr><td>xvfrint.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrint.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrm.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrm.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrne.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrne.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrp.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrp.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrz.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrintrz.s</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvfrsqrt.d</td><td>15</td><td>0.04(1/26.5)</td><td>15</td><td>0.04(1/27.5)</td></tr><tr><td>xvfrsqrt.s</td><td>25</td><td>0.05(1/19)</td><td>25</td><td>0.03(1/32)</td></tr><tr><td>xvfrstp.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvfrstp.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvfrstpi.b</td><td>2</td><
td>2</td><td>2</td><td>2</td></tr><tr><td>xvfrstpi.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvfsqrt.d</td><td>36</td><td>0.06(1/17.5)</td><td>36</td><td>0.05(1/18.5)</td></tr><tr><td>xvfsqrt.s</td><td>15</td><td>0.08(1/12)</td><td>15</td><td>0.07(1/13.5)</td></tr><tr><td>xvfsub.d</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>xvfsub.s</td><td>3</td><td>4</td><td>5</td><td>2</td></tr><tr><td>xvftint.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftint.lu.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftint.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftint.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftint.wu.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftinth.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintl.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrm.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrm.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrm.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrmh.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrml.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrne.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrne.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrne.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrneh.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrnel.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrp.l.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrp.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrp.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrph.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrpl.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrz.l.d</
td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrz.lu.d</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrz.w.d</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrz.w.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrz.wu.s</td><td>4</td><td>4</td><td>4</td><td>2</td></tr><tr><td>xvftintrzh.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvftintrzl.l.s</td><td>5</td><td>2</td><td>5</td><td>1</td></tr><tr><td>xvhaddw.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhaddw.du.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhaddw.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhaddw.hu.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhaddw.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvhaddw.qu.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvhaddw.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhaddw.wu.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhseli.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvhsubw.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhsubw.du.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhsubw.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhsubw.hu.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhsubw.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvhsubw.qu.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvhsubw.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvhsubw.wu.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvilvh.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvilvh.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvilvh.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvilvh.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvilvl.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><
td>xvilvl.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvilvl.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvilvl.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvinsgr2vr.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvinsgr2vr.w</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvinsve0.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvinsve0.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmadd.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmadd.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmadd.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmadd.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.h.bu.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.q.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>xvmaddwev.q.du</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>xvmaddwev.q.du.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>xvmaddwev.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwev.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.h.b
u.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.q.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>xvmaddwod.q.du</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>xvmaddwod.q.du.d</td><td>7</td><td>1.14</td><td>7</td><td>1.14</td></tr><tr><td>xvmaddwod.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmaddwod.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmax.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmax.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmax.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmax.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmax.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmax.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmax.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmax.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmaxi.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmaxi.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmaxi.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmaxi.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmaxi.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmaxi.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmaxi.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmaxi.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmin.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmin.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmin.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmin.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmin.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmin.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmin.w</td><td>1</td><td>4</td><td>1<
/td><td>2</td></tr><tr><td>xvmin.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmini.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmini.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmini.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmini.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvmini.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmini.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmini.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmini.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmod.b</td><td>29, 41</td><td>0.06(1/15.5)</td><td>29, 33</td><td>0.05(1/21.5)</td></tr><tr><td>xvmod.bu</td><td>29, 37</td><td>0.06(1/17.5)</td><td>29, 37</td><td>0.05(1/22)</td></tr><tr><td>xvmod.d</td><td>8, 10</td><td>0.25(1/4)</td><td>8, 10</td><td>0.11(1/9.5)</td></tr><tr><td>xvmod.du</td><td>8, 10</td><td>0.25(1/4)</td><td>8, 10</td><td>0.11(1/9.5)</td></tr><tr><td>xvmod.h</td><td>17, 21</td><td>0.12(1/8.5)</td><td>17, 21</td><td>0.07(1/13.5)</td></tr><tr><td>xvmod.hu</td><td>17, 25</td><td>0.11(1/9.5)</td><td>17, 23</td><td>0.06(1/16)</td></tr><tr><td>xvmod.w</td><td>11, 13</td><td>0.18(1/5.5)</td><td>11, 15</td><td>0.07(1/13.5)</td></tr><tr><td>xvmod.wu</td><td>11, 13</td><td>0.18(1/5.5)</td><td>11, 
15</td><td>0.06(1/16)</td></tr><tr><td>xvmskgez.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmskltz.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmskltz.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmskltz.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmskltz.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmsknz.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvmsub.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmsub.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmsub.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmsub.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.du</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmuh.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmul.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmul.d</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmul.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmul.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.h.bu.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.q.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>xvmulwev
.q.du</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>xvmulwev.q.du.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>xvmulwev.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwev.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.d.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.d.wu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.d.wu.w</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.h.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.h.bu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.h.bu.b</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.q.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>xvmulwod.q.du</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>xvmulwod.q.du.d</td><td>7</td><td>2</td><td>7</td><td>2</td></tr><tr><td>xvmulwod.w.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.w.hu</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvmulwod.w.hu.h</td><td>4</td><td>2</td><td>4</td><td>2</td></tr><tr><td>xvneg.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvneg.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvneg.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvneg.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvnor.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvnori.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvor.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvori.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvorn.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackev.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackev.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackev.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpac
kev.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackod.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackod.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackod.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpackod.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpcnt.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvpcnt.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvpcnt.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvpcnt.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvperm.w</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvpermi.d</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvpermi.q</td><td>3</td><td>2.67</td><td>3</td><td>2</td></tr><tr><td>xvpermi.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickev.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickev.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickev.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickev.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickod.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickod.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickod.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickod.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvpickve2gr.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvpickve2gr.du</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvpickve2gr.w</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvpickve2gr.wu</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvpickve.d</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvpickve.w</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvrepl128vei.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvrepl128vei.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvrepl128vei.h</td><t
d>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvrepl128vei.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvreplgr2vr.b</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>xvreplgr2vr.d</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>xvreplgr2vr.h</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>xvreplgr2vr.w</td><td>N/A</td><td>1</td><td>N/A</td><td>1</td></tr><tr><td>xvrepli.b</td><td>N/A</td><td>6</td><td>N/A</td><td>2</td></tr><tr><td>xvrepli.d</td><td>N/A</td><td>4</td><td>N/A</td><td>2</td></tr><tr><td>xvrepli.h</td><td>N/A</td><td>4</td><td>N/A</td><td>2</td></tr><tr><td>xvrepli.w</td><td>N/A</td><td>4</td><td>N/A</td><td>2</td></tr><tr><td>xvreplve0.b</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvreplve0.d</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvreplve0.h</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvreplve0.q</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvreplve0.w</td><td>3</td><td>4</td><td>3</td><td>2</td></tr><tr><td>xvreplve.b</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvreplve.d</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvreplve.h</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvreplve.w</td><td>1</td><td>1</td><td>1</td><td>1</td></tr><tr><td>xvrotr.b</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotr.d</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotr.h</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotr.w</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotri.b</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotri.d</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotri.h</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvrotri.w</td><td>1</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvsadd.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.d</t
d><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsadd.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsat.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.d</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.du</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsat.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvseq.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseq.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseq.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseq.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseqi.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseqi.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseqi.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvseqi.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsetallnez.b</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetallnez.d</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetallnez.h</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetallnez.w</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetanyeqz.b</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetanyeqz.d</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetanyeqz.h</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetanyeqz.w</td><td>N/A</td><td>2</td
><td>N/A</td><td>2</td></tr><tr><td>xvseteqz.v</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvsetnez.v</td><td>N/A</td><td>2</td><td>N/A</td><td>2</td></tr><tr><td>xvshuf4i.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvshuf4i.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvshuf4i.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvshuf4i.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvshuf.b</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvshuf.d</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvshuf.h</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvshuf.w</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvsigncov.b</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvsigncov.d</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvsigncov.h</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvsigncov.w</td><td>1</td><td>2</td><td>1</td><td>2</td></tr><tr><td>xvsle.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsle.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsle.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvsle.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvsle.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsle.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsle.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsle.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslei.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslei.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslei.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvslei.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvslei.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslei.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslei.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslei.wu</td><td>1
</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsll.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsll.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsll.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsll.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslli.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslli.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslli.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslli.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsllwil.d.w</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsllwil.du.wu</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsllwil.h.b</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsllwil.hu.bu</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsllwil.w.h</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsllwil.wu.hu</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvslt.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslt.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslt.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvslt.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvslt.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslt.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslt.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslt.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslti.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslti.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslti.d</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvslti.du</td><td>2</td><td>4</td><td>2</td><td>2</td></tr><tr><td>xvslti.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslti.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslti.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvslti.wu</td
><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsra.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsra.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsra.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsra.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrai.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrai.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrai.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrai.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsran.b.h</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsran.h.w</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsran.w.d</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsrani.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrani.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrani.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrani.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrar.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrar.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrar.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrar.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrari.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrari.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrari.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrari.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrarn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrarn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrarn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrarni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrarni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrarni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><
tr><td>xvsrarni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrl.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrl.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrl.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrl.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrli.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrli.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrli.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrli.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsrln.b.h</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsrln.h.w</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsrln.w.d</td><td>2</td><td>2</td><td>2</td><td>1</td></tr><tr><td>xvsrlni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlr.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlr.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlr.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlr.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlri.b</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlri.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlri.h</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlri.w</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlrn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlrn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlrn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlrni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvsrlrni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsrlrni.h.w</td><td>4</td><td>2</td><td
>4</td><td>1</td></tr><tr><td>xvsrlrni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssran.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssran.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssran.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssran.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssran.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssran.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrani.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrani.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrani.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrani.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrani.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrani.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrani.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrani.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarn.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarn.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarn.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarni.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrarni.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrarni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarni.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrarni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr>
<td>xvssrarni.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrln.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrln.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrln.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrln.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrln.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrln.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlni.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrlni.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrlni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlni.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlni.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrn.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrn.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrn.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrn.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrn.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrn.wu.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrni.b.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrni.bu.h</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrni.d.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrlrni.du.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvssrlrni.h.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrni.hu.w</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrni.w.d</td><td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssrlrni.wu.d</td><
td>4</td><td>2</td><td>4</td><td>1</td></tr><tr><td>xvssub.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvssub.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsub.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsub.d</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsub.h</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsub.q</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsub.w</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsubi.bu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsubi.du</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsubi.hu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsubi.wu</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvsubwev.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwev.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwev.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwev.h.bu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwev.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsubwev.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsubwev.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwev.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwod.d.w</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwod.d.wu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwod.h.b</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwod.h.bu</td><td>2</td><td>2</td><td>2</td>
<td>2</td></tr><tr><td>xvsubwod.q.d</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsubwod.q.du</td><td>3</td><td>2</td><td>3</td><td>2</td></tr><tr><td>xvsubwod.w.h</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvsubwod.w.hu</td><td>2</td><td>2</td><td>2</td><td>2</td></tr><tr><td>xvxor.v</td><td>1</td><td>4</td><td>1</td><td>2</td></tr><tr><td>xvxori.b</td><td>1</td><td>4</td><td>1</td><td>2</td></tr></tbody>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href=".." class="btn btn-neutral float-left" title="Unofficial LoongArch Intrinsics Guide"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../migrating_avx/" class="btn btn-neutral float-right" title="Migrating from AVX to LASX">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href=".." style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../migrating_avx/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "..";</script>
+    <script src="../js/theme_extra.js"></script>
+    <script src="../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/bitwise_operations/index.html b/lsx/bitwise_operations/index.html
new file mode 100644
index 00000000..6fa9df32
--- /dev/null
+++ b/lsx/bitwise_operations/index.html
@@ -0,0 +1,2289 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/bitwise_operations/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Bitwise Operations - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Bitwise Operations";
+        var mkdocs_page_input_path = "lsx/bitwise_operations.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/bitwise_operations/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LASX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LSX</span></p>
+              <ul class="current">
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Bitwise Operations</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitsel_v-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitseli_b-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_1">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclr_b-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_2">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclr_h-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_3">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclr_w-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_4">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclr_d-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_5">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclri_b-__m128i-a-imm0_7-imm">__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_6">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclri_h-__m128i-a-imm0_15-imm">__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_7">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclri_w-__m128i-a-imm0_31-imm">__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_8">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitclri_d-__m128i-a-imm0_63-imm">__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_9">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitset_b-__m128i-a-__m128i-b">__m128i __lsx_vbitset_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_10">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitset_h-__m128i-a-__m128i-b">__m128i __lsx_vbitset_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_11">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitset_w-__m128i-a-__m128i-b">__m128i __lsx_vbitset_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_12">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitset_d-__m128i-a-__m128i-b">__m128i __lsx_vbitset_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_13">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitseti_b-__m128i-a-imm0_7-imm">__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_14">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitseti_h-__m128i-a-imm0_15-imm">__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_15">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitseti_w-__m128i-a-imm0_31-imm">__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_16">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitseti_d-__m128i-a-imm0_63-imm">__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_17">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrev_b-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_18">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrev_h-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_19">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrev_w-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_20">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrev_d-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_21">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrevi_b-__m128i-a-imm0_7-imm">__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_22">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrevi_h-__m128i-a-imm0_15-imm">__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_23">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrevi_w-__m128i-a-imm0_31-imm">__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_24">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbitrevi_d-__m128i-a-imm0_63-imm">__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_25">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclo_b-__m128i-a">__m128i __lsx_vclo_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_26">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclo_h-__m128i-a">__m128i __lsx_vclo_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_27">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclo_w-__m128i-a">__m128i __lsx_vclo_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_28">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclo_d-__m128i-a">__m128i __lsx_vclo_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_29">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclz_b-__m128i-a">__m128i __lsx_vclz_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_30">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclz_h-__m128i-a">__m128i __lsx_vclz_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_31">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclz_w-__m128i-a">__m128i __lsx_vclz_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_32">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vclz_d-__m128i-a">__m128i __lsx_vclz_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_33">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpcnt_b-__m128i-a">__m128i __lsx_vpcnt_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_34">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpcnt_h-__m128i-a">__m128i __lsx_vpcnt_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_35">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpcnt_w-__m128i-a">__m128i __lsx_vpcnt_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_36">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpcnt_d-__m128i-a">__m128i __lsx_vpcnt_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_37">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Bitwise Operations</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="bitwise-operations">Bitwise Operations</h1>
+<h2 id="__m128i-__lsx_vbitsel_v-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitsel.v vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compute bitwise selection: for each bit position, if the bit in <code>c</code> equals one, copy the bit from <code>b</code> to <code>dst</code>, otherwise copy from <code>a</code>.</p>
+<h3 id="examples">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitsel_v(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, __m128i{0xffff0000aaaabbbb, 0x1111222233334444})
+= 0xabab3344ffeeefab 0x98ba9beccfedfb00
+</code></pre>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (c.dword[i] &amp; b.dword[i]) | (~c.dword[i] &amp; a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitseli_b-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitseli.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compute bitwise selection: for each bit position, if the bit in <code>a</code> equals one, copy the bit from <code>imm</code> to <code>dst</code>, otherwise copy from <code>b</code>.</p>
+<h3 id="examples_1">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseli_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, 0x12)
+= 0xba8b9aabba8b9a23 0x1216123012031221
+</code></pre>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (~a.byte[i] &amp; b.byte[i]) | (a.byte[i] &amp; (u8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclr_b-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclr.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_2">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700
+</code></pre>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; (b.byte[i] % 8)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclr_h-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclr.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_3">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xf7fff7fff7fff7ff 0x99aabbccddecff00
+</code></pre>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; (b.half[i] % 16)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclr_w-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclr.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_4">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xfffff7fffffff7ff 0x99aabbccddeeff00
+</code></pre>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; (b.word[i] % 32)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclr_d-__m128i-a-__m128i-b">__m128i __lsx_vbitclr_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclr.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Clear the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_5">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclr_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0xfffff7ffffffffff 0x99aabbccddeeff00
+</code></pre>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; (b.dword[i] % 64)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclri_b-__m128i-a-imm0_7-imm">__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclri.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_6">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00
+</code></pre>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclri_h-__m128i-a-imm0_15-imm">__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclri.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_7">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffdfffdfffdfffd 0x99a8bbccddecff00
+</code></pre>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclri_w-__m128i-a-imm0_31-imm">__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclri.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_8">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffffffdfffffffd 0x99aabbccddeeff00
+</code></pre>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitclri_d-__m128i-a-imm0_63-imm">__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitclri.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Clear the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_9">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitclri_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)
+= 0xfffffffffffffffd 0x99aabbccddeeff00
+</code></pre>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitset_b-__m128i-a-__m128i-b">__m128i __lsx_vbitset_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitset.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Set one bit in each 8-bit element of <code>a</code>; the bit index is the corresponding 8-bit element of <code>b</code> modulo 8. Save the result in <code>dst</code>.</p>
+<h3 id="examples_10">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0808080808080808 0x9dbabfdcddeeff02
+</code></pre>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; (b.byte[i] % 8));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitset_h-__m128i-a-__m128i-b">__m128i __lsx_vbitset_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitset.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Set one bit in each 16-bit element of <code>a</code>; the bit index is the corresponding 16-bit element of <code>b</code> modulo 16. Save the result in <code>dst</code>.</p>
+<h3 id="examples_11">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0800080008000800 0x99babbdcddeeff02
+</code></pre>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; (b.half[i] % 16));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitset_w-__m128i-a-__m128i-b">__m128i __lsx_vbitset_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitset.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Set one bit in each 32-bit element of <code>a</code>; the bit index is the corresponding 32-bit element of <code>b</code> modulo 32. Save the result in <code>dst</code>.</p>
+<h3 id="examples_12">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0000080000000800 0x99babbccddeeff02
+</code></pre>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; (b.word[i] % 32));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitset_d-__m128i-a-__m128i-b">__m128i __lsx_vbitset_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitset.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Set one bit in each 64-bit element of <code>a</code>; the bit index is the corresponding 64-bit element of <code>b</code> modulo 64. Save the result in <code>dst</code>.</p>
+<h3 id="examples_13">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitset_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0000080000000000 0x99aabbceddeeff00
+</code></pre>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; (b.dword[i] % 64));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitseti_b-__m128i-a-imm0_7-imm">__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitseti.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Set the bit at index <code>imm</code> in each 8-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_14">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0202020202020202 0x9baabbcedfeeff02
+</code></pre>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitseti_h-__m128i-a-imm0_15-imm">__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitseti.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Set the bit at index <code>imm</code> in each 16-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_15">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0002000200020002 0x99aabbceddeeff02
+</code></pre>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitseti_w-__m128i-a-imm0_31-imm">__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitseti.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Set the bit at index <code>imm</code> in each 32-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_16">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0000000200000002 0x99aabbceddeeff02
+</code></pre>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitseti_d-__m128i-a-imm0_63-imm">__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitseti.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Set the bit at index <code>imm</code> in each 64-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_17">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitseti_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)
+= 0x0000000000000002 0x99aabbccddeeff02
+</code></pre>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrev_b-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrev.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Toggle one bit in each 8-bit element of <code>a</code>; the bit index is the corresponding 8-bit element of <code>b</code> modulo 8. Save the result in <code>dst</code>.</p>
+<h3 id="examples_18">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0707070707070707 0x9dbabfdcd5ecf702
+</code></pre>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; (b.byte[i] % 8));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrev_h-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrev.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Toggle one bit in each 16-bit element of <code>a</code>; the bit index is the corresponding 16-bit element of <code>b</code> modulo 16. Save the result in <code>dst</code>.</p>
+<h3 id="examples_19">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x070f070f070f070f 0x99babbdcddecff02
+</code></pre>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; (b.half[i] % 16));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrev_w-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrev.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Toggle one bit in each 32-bit element of <code>a</code>; the bit index is the corresponding 32-bit element of <code>b</code> modulo 32. Save the result in <code>dst</code>.</p>
+<h3 id="examples_20">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0f0f070f0f0f070f 0x99babbccddeeff02
+</code></pre>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; (b.word[i] % 32));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrev_d-__m128i-a-__m128i-b">__m128i __lsx_vbitrev_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrev.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Toggle one bit in each 64-bit element of <code>a</code>; the bit index is the corresponding 64-bit element of <code>b</code> modulo 64. Save the result in <code>dst</code>.</p>
+<h3 id="examples_21">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrev_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})
+= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00
+</code></pre>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; (b.dword[i] % 64));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrevi_b-__m128i-a-imm0_7-imm">__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrevi.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 8-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_22">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02
+</code></pre>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrevi_h-__m128i-a-imm0_15-imm">__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrevi.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 16-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_23">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02
+</code></pre>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrevi_w-__m128i-a-imm0_31-imm">__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrevi.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 32-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_24">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02
+</code></pre>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbitrevi_d-__m128i-a-imm0_63-imm">__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbitrevi.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Toggle the bit at index <code>imm</code> in each 64-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_25">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vbitrevi_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)
+= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02
+</code></pre>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclo_b-__m128i-a">__m128i __lsx_vclo_b (__m128i a)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclo.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Count the leading one bits of each 8-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_26">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000001 0x0101010202030800
+</code></pre>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = clo(a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclo_h-__m128i-a">__m128i __lsx_vclo_h (__m128i a)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclo.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Count the leading one bits of each 16-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_27">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0001000100020008
+</code></pre>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = clo(a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclo_w-__m128i-a">__m128i __lsx_vclo_w (__m128i a)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclo.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Count the leading one bits of each 32-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_28">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0000000100000002
+</code></pre>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = clo(a.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclo_d-__m128i-a">__m128i __lsx_vclo_d (__m128i a)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclo.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Count the leading one bits of each 64-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_29">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclo_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000000 0x0000000000000001
+</code></pre>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = clo(a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclz_b-__m128i-a">__m128i __lsx_vclz_b (__m128i a)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclz.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Count the leading zero bits of each 8-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_30">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0302020101010100 0x0000000000000008
+</code></pre>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = clz(a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclz_h-__m128i-a">__m128i __lsx_vclz_h (__m128i a)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclz.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Count the leading zero bits of each 16-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_31">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0003000200010001 0x0000000000000000
+</code></pre>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = clz(a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclz_w-__m128i-a">__m128i __lsx_vclz_w (__m128i a)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclz.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Count the leading zero bits of each 32-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_32">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000300000001 0x0000000000000000
+</code></pre>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = clz(a.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vclz_d-__m128i-a">__m128i __lsx_vclz_d (__m128i a)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vclz.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Count the leading zero bits of each 64-bit element in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_33">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vclz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000003 0x0000000000000000
+</code></pre>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = clz(a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpcnt_b-__m128i-a">__m128i __lsx_vpcnt_b (__m128i a)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpcnt.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Count the number of one bits (population count, popcount) in each 8-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_34">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0202040204040602 0x0404060406060800
+</code></pre>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = popcount(a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpcnt_h-__m128i-a">__m128i __lsx_vpcnt_h (__m128i a)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpcnt.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Count the number of one bits (population count, popcount) in each 16-bit element of <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="examples_35">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0004000600080008 0x0008000a000c0008
+</code></pre>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = popcount(a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpcnt_w-__m128i-a">__m128i __lsx_vpcnt_w (__m128i a)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpcnt.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Count the number of set bits (population count, popcount) in each 32-bit element of <code>a</code>.</p>
+<h3 id="examples_36">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000a00000010 0x0000001200000014
+</code></pre>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = popcount(a.word[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpcnt_d-__m128i-a">__m128i __lsx_vpcnt_d (__m128i a)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpcnt.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Count the number of set bits (population count, popcount) in each 64-bit element of <code>a</code>.</p>
+<h3 id="examples_37">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vpcnt_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000001a 0x0000000000000026
+</code></pre>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = popcount(a.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../../lasx/shuffling/" class="btn btn-neutral float-left" title="Shuffling"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../branch/" class="btn btn-neutral float-right" title="Branch">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../../lasx/shuffling/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../branch/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/branch/index.html b/lsx/branch/index.html
new file mode 100644
index 00000000..3770b352
--- /dev/null
+++ b/lsx/branch/index.html
@@ -0,0 +1,709 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/branch/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Branch - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Branch";
+        var mkdocs_page_input_path = "lsx/branch.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/branch/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Branch</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bz_v-__m128i-a">int __lsx_bz_v (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bnz_v-__m128i-a">int __lsx_bnz_v (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bz_b-__m128i-a">int __lsx_bz_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bz_h-__m128i-a">int __lsx_bz_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bz_w-__m128i-a">int __lsx_bz_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bz_d-__m128i-a">int __lsx_bz_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bnz_b-__m128i-a">int __lsx_bnz_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bnz_h-__m128i-a">int __lsx_bnz_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bnz_w-__m128i-a">int __lsx_bnz_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_bnz_d-__m128i-a">int __lsx_bnz_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Branch</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="branch">Branch</h1>
+<h2 id="int-__lsx_bz_v-__m128i-a">int __lsx_bz_v (__m128i a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bz_v (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseteqz.v fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Expected to be used in branches: branch if the whole vector <code>a</code> is equal to zero.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">dst = a.qword[0] == 0;
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bnz_v-__m128i-a">int __lsx_bnz_v (__m128i a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bnz_v (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetnez.v fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Expected to be used in branches: branch if the whole vector <code>a</code> is non-zero.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">dst = a.qword[0] != 0;
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bz_b-__m128i-a">int __lsx_bz_b (__m128i a)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bz_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetanyeqz.b fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Expected to be used in branches: branch if any 8-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 16; i++) {
+  if (a.byte[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bz_h-__m128i-a">int __lsx_bz_h (__m128i a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bz_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetanyeqz.h fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Expected to be used in branches: branch if any 16-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 8; i++) {
+  if (a.half[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bz_w-__m128i-a">int __lsx_bz_w (__m128i a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bz_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetanyeqz.w fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Expected to be used in branches: branch if any 32-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 4; i++) {
+  if (a.word[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bz_d-__m128i-a">int __lsx_bz_d (__m128i a)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bz_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetanyeqz.d fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Expected to be used in branches: branch if any 64-bit element in <code>a</code> is equal to zero.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">dst = 0;
+for (int i = 0; i &lt; 2; i++) {
+  if (a.dword[i] == 0) {
+    dst = 1;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bnz_b-__m128i-a">int __lsx_bnz_b (__m128i a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bnz_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetallnez.b fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Expected to be used in branches: branch if all 8-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 16; i++) {
+  if (a.byte[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bnz_h-__m128i-a">int __lsx_bnz_h (__m128i a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bnz_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetallnez.h fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Expected to be used in branches: branch if all 16-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 8; i++) {
+  if (a.half[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bnz_w-__m128i-a">int __lsx_bnz_w (__m128i a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bnz_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetallnez.w fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Expected to be used in branches: branch if all 32-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 4; i++) {
+  if (a.word[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_bnz_d-__m128i-a">int __lsx_bnz_d (__m128i a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_bnz_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsetallnez.d fcc, vr; bcnez
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Expected to be used in branches: branch if all 64-bit elements in <code>a</code> are non-zero.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">dst = 1;
+for (int i = 0; i &lt; 2; i++) {
+  if (a.dword[i] == 0) {
+    dst = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../bitwise_operations/" class="btn btn-neutral float-left" title="Bitwise Operations"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_comparison/" class="btn btn-neutral float-right" title="Floating Point Comparison">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../bitwise_operations/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_comparison/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/float_comparison/index.html b/lsx/float_comparison/index.html
new file mode 100644
index 00000000..f01ca71d
--- /dev/null
+++ b/lsx/float_comparison/index.html
@@ -0,0 +1,2443 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_comparison/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Comparison - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Comparison";
+        var mkdocs_page_input_path = "lsx/float_comparison.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/float_comparison/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Comparison</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_caf_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_caf_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_ceq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_ceq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cle_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cle_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_clt_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_clt_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cne_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cne_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cor_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cor_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cueq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cueq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cule_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cule_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cult_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cult_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cun_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cun_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cune_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_cune_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_saf_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_saf_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_seq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_seq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sle_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sle_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_slt_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_slt_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sne_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sne_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sor_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sor_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sueq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sueq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sule_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sule_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sult_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sult_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sun_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sun_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sune_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcmp_sune_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Floating Point Comparison</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-comparison">Floating Point Comparison</h1>
+<h2 id="__m128i-__lsx_vfcmp_caf_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.caf.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_caf_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.caf.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_ceq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.ceq.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_ceq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.ceq.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cle_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cle.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cle_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cle.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_clt_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.clt.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_clt_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.clt.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cne_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cne.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cne_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cne.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cor_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cor.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cor_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cor.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cueq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cueq.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cueq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cueq.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cule_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cule.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cule_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cule.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cult_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cult.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cult_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cult.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cun_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cun.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cun_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cun.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cune_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cune.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_cune_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.cune.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Do not trap for QNaN.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_saf_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.saf.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_saf_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.saf.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_seq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.seq.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_seq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.seq.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sle_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sle.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sle_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sle.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_slt_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.slt.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_slt_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.slt.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sne_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sne.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sne_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sne.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sor_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sor.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sor_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sor.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sueq_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sueq.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sueq_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sueq.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sule_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sule.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sule_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sule.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sult_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sult.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sult_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sult.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sun_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sun.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sun_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sun.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sune_s-__m128-a-__m128-b">__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sune.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {
+    dst.word[i] = 0xFFFFFFFF;
+  } else {
+    dst.word[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcmp_sune_d-__m128d-a-__m128d-b">__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcmp.sune.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. Trap for QNaN.</p>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {
+    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;
+  } else {
+    dst.dword[i] = 0;
+  }
+}</code></pre>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../branch/" class="btn btn-neutral float-left" title="Branch"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_computation/" class="btn btn-neutral float-right" title="Floating Point Computation">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../branch/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_computation/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/float_computation/index.html b/lsx/float_computation/index.html
new file mode 100644
index 00000000..a130d12d
--- /dev/null
+++ b/lsx/float_computation/index.html
@@ -0,0 +1,1451 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_computation/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Computation - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Computation";
+        var mkdocs_page_input_path = "lsx/float_computation.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/float_computation/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Computation</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfadd_s-__m128-a-__m128-b">__m128 __lsx_vfadd_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfadd_d-__m128d-a-__m128d-b">__m128d __lsx_vfadd_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfdiv_s-__m128-a-__m128-b">__m128 __lsx_vfdiv_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfdiv_d-__m128d-a-__m128d-b">__m128d __lsx_vfdiv_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmax_s-__m128-a-__m128-b">__m128 __lsx_vfmax_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmax_d-__m128d-a-__m128d-b">__m128d __lsx_vfmax_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmaxa_s-__m128-a-__m128-b">__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmaxa_d-__m128d-a-__m128d-b">__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmin_s-__m128-a-__m128-b">__m128 __lsx_vfmin_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmin_d-__m128d-a-__m128d-b">__m128d __lsx_vfmin_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmina_s-__m128-a-__m128-b">__m128 __lsx_vfmina_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmina_d-__m128d-a-__m128d-b">__m128d __lsx_vfmina_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmul_s-__m128-a-__m128-b">__m128 __lsx_vfmul_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmul_d-__m128d-a-__m128d-b">__m128d __lsx_vfmul_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfsub_s-__m128-a-__m128-b">__m128 __lsx_vfsub_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfsub_d-__m128d-a-__m128d-b">__m128d __lsx_vfsub_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vflogb_s-__m128-a">__m128 __lsx_vflogb_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vflogb_d-__m128d-a">__m128d __lsx_vflogb_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfsqrt_s-__m128-a">__m128 __lsx_vfsqrt_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfsqrt_d-__m128d-a">__m128d __lsx_vfsqrt_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrsqrt_s-__m128-a">__m128 __lsx_vfrsqrt_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrsqrt_d-__m128d-a">__m128d __lsx_vfrsqrt_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrecip_s-__m128-a">__m128 __lsx_vfrecip_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrecip_d-__m128d-a">__m128d __lsx_vfrecip_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrsqrte_s-__m128-a">__m128 __lsx_vfrsqrte_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrsqrte_d-__m128d-a">__m128d __lsx_vfrsqrte_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrecipe_s-__m128-a">__m128 __lsx_vfrecipe_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrecipe_d-__m128d-a">__m128d __lsx_vfrecipe_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Floating Point Computation</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-computation">Floating Point Computation</h1>
+<h2 id="__m128-__lsx_vfadd_s-__m128-a-__m128-b">__m128 __lsx_vfadd_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfadd_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfadd.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Add single precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp32[i] + b.fp32[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfadd_d-__m128d-a-__m128d-b">__m128d __lsx_vfadd_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfadd_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfadd.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Add double precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp64[i] + b.fp64[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfdiv_s-__m128-a-__m128-b">__m128 __lsx_vfdiv_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfdiv_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfdiv.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Divide single precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp32[i] / b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 19.5</td>
+<td>0.13(1/7.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfdiv_d-__m128d-a-__m128d-b">__m128d __lsx_vfdiv_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfdiv_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfdiv.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Divide double precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp64[i] / b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8, 21.5</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 16.5</td>
+<td>0.08(1/12.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmax_s-__m128-a-__m128-b">__m128 __lsx_vfmax_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmax_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmax.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfmax_d-__m128d-a-__m128d-b">__m128d __lsx_vfmax_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmax_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmax.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmaxa_s-__m128-a-__m128-b">__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmaxa.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) &gt; abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfmaxa_d-__m128d-a-__m128d-b">__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmaxa.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) &gt; abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmin_s-__m128-a-__m128-b">__m128 __lsx_vfmin_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmin_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmin.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfmin_d-__m128d-a-__m128d-b">__m128d __lsx_vfmin_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmin_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmin.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmina_s-__m128-a-__m128-b">__m128 __lsx_vfmina_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmina_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmina.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (abs(a.fp32[i]) &lt; abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfmina_d-__m128d-a-__m128d-b">__m128d __lsx_vfmina_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmina_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmina.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (abs(a.fp64[i]) &lt; abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmul_s-__m128-a-__m128-b">__m128 __lsx_vfmul_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmul_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmul.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Multiply single precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfmul_d-__m128d-a-__m128d-b">__m128d __lsx_vfmul_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmul_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmul.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Multiply double precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfsub_s-__m128-a-__m128-b">__m128 __lsx_vfsub_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfsub_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfsub.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Subtract single precision floating point elements in <code>b</code> from elements in <code>a</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp32[i] - b.fp32[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfsub_d-__m128d-a-__m128d-b">__m128d __lsx_vfsub_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfsub_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfsub.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Subtract double precision floating point elements in <code>b</code> from elements in <code>a</code>.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp64[i] - b.fp64[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vflogb_s-__m128-a">__m128 __lsx_vflogb_s (__m128 a)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vflogb_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vflogb.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Compute the base-2 logarithm of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = log2(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vflogb_d-__m128d-a">__m128d __lsx_vflogb_d (__m128d a)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vflogb_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vflogb.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Compute the base-2 logarithm of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = log2(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfsqrt_s-__m128-a">__m128 __lsx_vfsqrt_s (__m128 a)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfsqrt_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfsqrt.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Compute the square root of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = sqrt(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.08(1/12)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>27</td>
+<td>0.17(1/6)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfsqrt_d-__m128d-a">__m128d __lsx_vfsqrt_d (__m128d a)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfsqrt_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfsqrt.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Compute the square root of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = sqrt(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>36</td>
+<td>0.06(1/17.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>36</td>
+<td>0.05(1/18.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrsqrt_s-__m128-a">__m128 __lsx_vfrsqrt_s (__m128 a)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrsqrt_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrsqrt.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Compute the reciprocal of the square root of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17</td>
+<td>0.05(1/19)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>21</td>
+<td>0.11(1/9)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrsqrt_d-__m128d-a">__m128d __lsx_vfrsqrt_d (__m128d a)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrsqrt_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrsqrt.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Compute the reciprocal of the square root of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>15</td>
+<td>0.04(1/26.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>15</td>
+<td>0.04(1/27.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrecip_s-__m128-a">__m128 __lsx_vfrecip_s (__m128 a)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrecip_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrecip.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Compute the reciprocal of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = 1 / a.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>27</td>
+<td>0.14(1/7)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrecip_d-__m128d-a">__m128d __lsx_vfrecip_d (__m128d a)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrecip_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrecip.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Compute the reciprocal of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = 1 / a.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>23</td>
+<td>0.08(1/12)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrsqrte_s-__m128-a">__m128 __lsx_vfrsqrte_s (__m128 a)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrsqrte_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrsqrte.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Compute the estimated reciprocal of the square root of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated
+}
+</code></pre>
+<h2 id="__m128d-__lsx_vfrsqrte_d-__m128d-a">__m128d __lsx_vfrsqrte_d (__m128d a)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrsqrte_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrsqrte.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Compute the estimated reciprocal of the square root of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated
+}
+</code></pre>
+<h2 id="__m128-__lsx_vfrecipe_s-__m128-a">__m128 __lsx_vfrecipe_s (__m128 a)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrecipe_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrecipe.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Compute the estimated reciprocal of single precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = 1 / a.fp32[i]; // estimated
+}
+</code></pre>
+<h2 id="__m128d-__lsx_vfrecipe_d-__m128d-a">__m128d __lsx_vfrecipe_d (__m128d a)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrecipe_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrecipe.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Compute the estimated reciprocal of double precision floating point elements in <code>a</code>.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = 1 / a.fp64[i]; // estimated
+}
+</code></pre>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_comparison/" class="btn btn-neutral float-left" title="Floating Point Comparison"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_conversion/" class="btn btn-neutral float-right" title="Floating Point Conversion">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_comparison/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_conversion/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/float_conversion/index.html b/lsx/float_conversion/index.html
new file mode 100644
index 00000000..b6f92c39
--- /dev/null
+++ b/lsx/float_conversion/index.html
@@ -0,0 +1,2240 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_conversion/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Conversion - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Conversion";
+        var mkdocs_page_input_path = "lsx/float_conversion.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/float_conversion/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Conversion</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfcvth_d_s-__m128-a">__m128d __lsx_vfcvth_d_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfcvtl_d_s-__m128-a">__m128d __lsx_vfcvtl_d_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfcvt_s_d-__m128d-a-__m128d-b">__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfcvth_s_h-__m128i-a">__m128 __lsx_vfcvth_s_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfcvtl_s_h-__m128i-a">__m128 __lsx_vfcvtl_s_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfcvt_h_s-__m128-a-__m128-b">__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vffinth_d_w-__m128i-a">__m128d __lsx_vffinth_d_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vffintl_d_w-__m128i-a">__m128d __lsx_vffintl_d_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vffint_d_l-__m128i-a">__m128d __lsx_vffint_d_l (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vffint_d_lu-__m128i-a">__m128d __lsx_vffint_d_lu (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vffint_s_w-__m128i-a">__m128 __lsx_vffint_s_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vffint_s_wu-__m128i-a">__m128 __lsx_vffint_s_wu (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vffint_s_l-__m128i-a-__m128i-b">__m128 __lsx_vffint_s_l (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintl_l_s-__m128-a">__m128i __lsx_vftintl_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftinth_l_s-__m128-a">__m128i __lsx_vftinth_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrml_l_s-__m128-a">__m128i __lsx_vftintrml_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrmh_l_s-__m128-a">__m128i __lsx_vftintrmh_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrpl_l_s-__m128-a">__m128i __lsx_vftintrpl_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrph_l_s-__m128-a">__m128i __lsx_vftintrph_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrzl_l_s-__m128-a">__m128i __lsx_vftintrzl_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrzh_l_s-__m128-a">__m128i __lsx_vftintrzh_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrnel_l_s-__m128-a">__m128i __lsx_vftintrnel_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrneh_l_s-__m128-a">__m128i __lsx_vftintrneh_l_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftint_l_d-__m128d-a">__m128i __lsx_vftint_l_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftint_w_s-__m128-a">__m128i __lsx_vftint_w_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrm_l_d-__m128d-a">__m128i __lsx_vftintrm_l_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrm_w_s-__m128-a">__m128i __lsx_vftintrm_w_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrp_l_d-__m128d-a">__m128i __lsx_vftintrp_l_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrp_w_s-__m128-a">__m128i __lsx_vftintrp_w_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrz_l_d-__m128d-a">__m128i __lsx_vftintrz_l_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrz_w_s-__m128-a">__m128i __lsx_vftintrz_w_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrne_l_d-__m128d-a">__m128i __lsx_vftintrne_l_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrne_w_s-__m128-a">__m128i __lsx_vftintrne_w_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftint_lu_d-__m128d-a">__m128i __lsx_vftint_lu_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftint_wu_s-__m128-a">__m128i __lsx_vftint_wu_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrz_lu_d-__m128d-a">__m128i __lsx_vftintrz_lu_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrz_wu_s-__m128-a">__m128i __lsx_vftintrz_wu_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftint_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftint_w_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrm_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrp_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrz_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vftintrne_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Floating Point Conversion</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-conversion">Floating Point Conversion</h1>
+<h2 id="__m128d-__lsx_vfcvth_d_s-__m128-a">__m128d __lsx_vfcvth_d_s (__m128 a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfcvth_d_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcvth.d.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Convert single precision floating point elements in higher half of <code>a</code> to double precision.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp32[2 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfcvtl_d_s-__m128-a">__m128d __lsx_vfcvtl_d_s (__m128 a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfcvtl_d_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcvtl.d.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Convert single precision floating point elements in lower half of <code>a</code> to double precision.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp32[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfcvt_s_d-__m128d-a-__m128d-b">__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcvt.s.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Convert double precision floating point elements in <code>a</code> and <code>b</code> to single precision.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    dst.fp32[i] = b.fp64[i];
+  } else {
+    dst.fp32[i] = a.fp64[i - 2];
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfcvth_s_h-__m128i-a">__m128 __lsx_vfcvth_s_h (__m128i a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfcvth_s_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcvth.s.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Convert half precision floating point elements in higher half of <code>a</code> to single precision.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp16[4 + i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfcvtl_s_h-__m128i-a">__m128 __lsx_vfcvtl_s_h (__m128i a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfcvtl_s_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcvtl.s.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Convert half precision floating point elements in lower half of <code>a</code> to single precision.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp16[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfcvt_h_s-__m128-a-__m128-b">__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfcvt.h.s vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Convert single precision floating point elements in <code>a</code> and <code>b</code> to half precision.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    dst.fp16[i] = b.fp32[i];
+  } else {
+    dst.fp16[i] = a.fp32[i - 4];
+  }
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vffinth_d_w-__m128i-a">__m128d __lsx_vffinth_d_w (__m128i a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vffinth_d_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffinth.d.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Convert 32-bit integer elements in higher part of <code>a</code> to double precision floating point numbers.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vffintl_d_w-__m128i-a">__m128d __lsx_vffintl_d_w (__m128i a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vffintl_d_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffintl.d.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Convert 32-bit integer elements in lower part of <code>a</code> to double precision floating point numbers.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vffint_d_l-__m128i-a">__m128d __lsx_vffint_d_l (__m128i a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vffint_d_l (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffint.d.l vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Convert signed 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vffint_d_lu-__m128i-a">__m128d __lsx_vffint_d_lu (__m128i a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vffint_d_lu (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffint.d.lu vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Convert unsigned 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vffint_s_w-__m128i-a">__m128 __lsx_vffint_s_w (__m128i a)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vffint_s_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffint.s.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Convert signed 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vffint_s_wu-__m128i-a">__m128 __lsx_vffint_s_wu (__m128i a)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vffint_s_wu (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffint.s.wu vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Convert unsigned 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vffint_s_l-__m128i-a-__m128i-b">__m128 __lsx_vffint_s_l (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vffint_s_l (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vffint.s.l vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Convert 64-bit integer elements in <code>a</code> and <code>b</code> to single-precision floating point numbers.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] =
+      (i &lt; 2) ? (f32)(s64)b.dword[i]
+              : (f32)(s64)a.dword[i - 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintl_l_s-__m128-a">__m128i __lsx_vftintl_l_s (__m128 a)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintl_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintl.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Convert single-precision floating point elements in the lower part of <code>a</code> to 64-bit integers, using the current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftinth_l_s-__m128-a">__m128i __lsx_vftinth_l_s (__m128 a)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftinth_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftinth.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Convert single-precision floating point elements in the higher part of <code>a</code> to 64-bit integers, using the current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrml_l_s-__m128-a">__m128i __lsx_vftintrml_l_s (__m128 a)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrml_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrml.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrmh_l_s-__m128-a">__m128i __lsx_vftintrmh_l_s (__m128 a)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrmh_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrmh.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrpl_l_s-__m128-a">__m128i __lsx_vftintrpl_l_s (__m128 a)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrpl_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrpl.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrph_l_s-__m128-a">__m128i __lsx_vftintrph_l_s (__m128 a)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrph_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrph.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrzl_l_s-__m128-a">__m128i __lsx_vftintrzl_l_s (__m128 a)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrzl_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrzl.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrzh_l_s-__m128-a">__m128i __lsx_vftintrzh_l_s (__m128 a)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrzh_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrzh.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrnel_l_s-__m128-a">__m128i __lsx_vftintrnel_l_s (__m128 a)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrnel_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrnel.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrneh_l_s-__m128-a">__m128i __lsx_vftintrneh_l_s (__m128 a)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrneh_l_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrneh.l.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp32[i + 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftint_l_d-__m128d-a">__m128i __lsx_vftint_l_d (__m128d a)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftint_l_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftint.l.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftint_w_s-__m128-a">__m128i __lsx_vftint_w_s (__m128 a)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftint_w_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftint.w.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrm_l_d-__m128d-a">__m128i __lsx_vftintrm_l_d (__m128d a)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrm_l_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrm.l.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrm_w_s-__m128-a">__m128i __lsx_vftintrm_w_s (__m128 a)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrm_w_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrm.w.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrp_l_d-__m128d-a">__m128i __lsx_vftintrp_l_d (__m128d a)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrp_l_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrp.l.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrp_w_s-__m128-a">__m128i __lsx_vftintrp_w_s (__m128 a)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrp_w_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrp.w.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrz_l_d-__m128d-a">__m128i __lsx_vftintrz_l_d (__m128d a)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrz_l_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrz.l.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrz_w_s-__m128-a">__m128i __lsx_vftintrz_w_s (__m128 a)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrz_w_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrz.w.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards zero.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrne_l_d-__m128d-a">__m128i __lsx_vftintrne_l_d (__m128d a)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrne_l_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrne.l.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrne_w_s-__m128-a">__m128i __lsx_vftintrne_w_s (__m128 a)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrne_w_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrne.w.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftint_lu_d-__m128d-a">__m128i __lsx_vftint_lu_d (__m128d a)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftint_lu_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftint.lu.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftint_wu_s-__m128-a">__m128i __lsx_vftint_wu_s (__m128 a)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftint_wu_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftint.wu.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrz_lu_d-__m128d-a">__m128i __lsx_vftintrz_lu_d (__m128d a)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrz_lu_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrz.lu.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, rounding towards zero.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrz_wu_s-__m128-a">__m128i __lsx_vftintrz_wu_s (__m128 a)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrz_wu_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrz.wu.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, rounding towards zero.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftint_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftint_w_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftint_w_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftint.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrm_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrm.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards negative infinity.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrp_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrp.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards positive infinity.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrz_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrz.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards zero.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vftintrne_w_d-__m128d-a-__m128d-b">__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vftintrne.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards nearest even.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2)
+                    ? (s32)b.fp64[i]
+                    : (s32)a.fp64[i - 2]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_computation/" class="btn btn-neutral float-left" title="Floating Point Computation"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../float_misc/" class="btn btn-neutral float-right" title="Floating Point Misc">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_computation/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../float_misc/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/float_misc/index.html b/lsx/float_misc/index.html
new file mode 100644
index 00000000..b0fe8122
--- /dev/null
+++ b/lsx/float_misc/index.html
@@ -0,0 +1,775 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_misc/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Floating Point Misc - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Floating Point Misc";
+        var mkdocs_page_input_path = "lsx/float_misc.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/float_misc/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Floating Point Misc</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfclass_d-__m128d-a">__m128i __lsx_vfclass_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfclass_s-__m128-a">__m128i __lsx_vfclass_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrint_s-__m128-a">__m128 __lsx_vfrint_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrint_d-__m128d-a">__m128d __lsx_vfrint_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrintrp_s-__m128-a">__m128 __lsx_vfrintrp_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrintrp_d-__m128d-a">__m128d __lsx_vfrintrp_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrintrm_s-__m128-a">__m128 __lsx_vfrintrm_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrintrm_d-__m128d-a">__m128d __lsx_vfrintrm_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrintrz_s-__m128-a">__m128 __lsx_vfrintrz_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrintrz_d-__m128d-a">__m128d __lsx_vfrintrz_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfrintrne_s-__m128-a">__m128 __lsx_vfrintrne_s (__m128 a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfrintrne_d-__m128d-a">__m128d __lsx_vfrintrne_d (__m128d a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Floating Point Misc</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="floating-point-misc">Floating Point Misc</h1>
+<h2 id="__m128i-__lsx_vfclass_d-__m128d-a">__m128i __lsx_vfclass_d (__m128d a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfclass_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfclass.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Classify each double-precision floating point element in <code>a</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = fp_classify(a.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfclass_s-__m128-a">__m128i __lsx_vfclass_s (__m128 a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfclass_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfclass.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Classify each single-precision floating point element in <code>a</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = fp_classify(a.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrint_s-__m128-a">__m128 __lsx_vfrint_s (__m128 a)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrint_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrint.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, using current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrint_d-__m128d-a">__m128d __lsx_vfrint_d (__m128d a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrint_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrint.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, using current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrintrp_s-__m128-a">__m128 __lsx_vfrintrp_s (__m128 a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrintrp_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrp.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrintrp_d-__m128d-a">__m128d __lsx_vfrintrp_d (__m128d a)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrintrp_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrp.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrintrm_s-__m128-a">__m128 __lsx_vfrintrm_s (__m128 a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrintrm_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrm.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrintrm_d-__m128d-a">__m128d __lsx_vfrintrm_d (__m128d a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrintrm_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrm.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrintrz_s-__m128-a">__m128 __lsx_vfrintrz_s (__m128 a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrintrz_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrz.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrintrz_d-__m128d-a">__m128d __lsx_vfrintrz_d (__m128d a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrintrz_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrz.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfrintrne_s-__m128-a">__m128 __lsx_vfrintrne_s (__m128 a)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfrintrne_s (__m128 a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrne.s vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Round single-precision floating point elements in <code>a</code> to integers, rounding to nearest, with ties to even, and store as floating point numbers.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfrintrne_d-__m128d-a">__m128d __lsx_vfrintrne_d (__m128d a)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfrintrne_d (__m128d a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrintrne.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Round double-precision floating point elements in <code>a</code> to integers, rounding to nearest, with ties to even, and store as floating point numbers.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C
+}
+</code></pre>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_conversion/" class="btn btn-neutral float-left" title="Floating Point Conversion"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../fma/" class="btn btn-neutral float-right" title="Fused Multiply-Add">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_conversion/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../fma/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/fma/index.html b/lsx/fma/index.html
new file mode 100644
index 00000000..e033d461
--- /dev/null
+++ b/lsx/fma/index.html
@@ -0,0 +1,583 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/fma/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Fused Multiply-Add - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Fused Multiply-Add";
+        var mkdocs_page_input_path = "lsx/fma.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/fma/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Fused Multiply-Add</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmadd_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmadd_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfmsub_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfmsub_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfnmadd_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfnmadd_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128d-__lsx_vfnmsub_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128-__lsx_vfnmsub_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Fused Multiply-Add</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="fused-multiply-add">Fused Multiply-Add</h1>
+<h2 id="__m128d-__lsx_vfmadd_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmadd_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfmsub_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfmsub_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];
+}
+</code></pre>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfnmadd_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfnmadd.d vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfnmadd_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfnmadd.s vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128d-__lsx_vfnmsub_d-__m128d-a-__m128d-b-__m128d-c">__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfnmsub.d vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128-__lsx_vfnmsub_s-__m128-a-__m128-b-__m128-c">__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfnmsub.s vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the negated result in <code>dst</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);
+}
+</code></pre>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>5</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../float_misc/" class="btn btn-neutral float-left" title="Floating Point Misc"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../integer_comparison/" class="btn btn-neutral float-right" title="Integer Comparison">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../float_misc/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../integer_comparison/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/integer_comparison/index.html b/lsx/integer_comparison/index.html
new file mode 100644
index 00000000..0d136b62
--- /dev/null
+++ b/lsx/integer_comparison/index.html
@@ -0,0 +1,2159 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_comparison/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Integer Comparison - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Integer Comparison";
+        var mkdocs_page_input_path = "lsx/integer_comparison.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/integer_comparison/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Integer Comparison</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseq_b-__m128i-a-__m128i-b">__m128i __lsx_vseq_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseq_h-__m128i-a-__m128i-b">__m128i __lsx_vseq_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseq_w-__m128i-a-__m128i-b">__m128i __lsx_vseq_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseq_d-__m128i-a-__m128i-b">__m128i __lsx_vseq_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseqi_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseqi_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseqi_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vseqi_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_b-__m128i-a-__m128i-b">__m128i __lsx_vslt_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_bu-__m128i-a-__m128i-b">__m128i __lsx_vslt_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_h-__m128i-a-__m128i-b">__m128i __lsx_vslt_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_hu-__m128i-a-__m128i-b">__m128i __lsx_vslt_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_w-__m128i-a-__m128i-b">__m128i __lsx_vslt_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_wu-__m128i-a-__m128i-b">__m128i __lsx_vslt_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_d-__m128i-a-__m128i-b">__m128i __lsx_vslt_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslt_du-__m128i-a-__m128i-b">__m128i __lsx_vslt_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslti_du-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_b-__m128i-a-__m128i-b">__m128i __lsx_vsle_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_bu-__m128i-a-__m128i-b">__m128i __lsx_vsle_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_h-__m128i-a-__m128i-b">__m128i __lsx_vsle_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_hu-__m128i-a-__m128i-b">__m128i __lsx_vsle_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_w-__m128i-a-__m128i-b">__m128i __lsx_vsle_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_wu-__m128i-a-__m128i-b">__m128i __lsx_vsle_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_d-__m128i-a-__m128i-b">__m128i __lsx_vsle_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsle_du-__m128i-a-__m128i-b">__m128i __lsx_vsle_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslei_du-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Integer Comparison</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="integer-comparison">Integer Comparison</h1>
+<h2 id="__m128i-__lsx_vseq_b-__m128i-a-__m128i-b">__m128i __lsx_vseq_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseq_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseq.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compare the 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseq_h-__m128i-a-__m128i-b">__m128i __lsx_vseq_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseq_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseq.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compare the 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseq_w-__m128i-a-__m128i-b">__m128i __lsx_vseq_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseq_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseq.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compare the 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseq_d-__m128i-a-__m128i-b">__m128i __lsx_vseq_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseq_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseq.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compare the 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseqi_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseqi.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compare the 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseqi_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseqi.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compare the 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseqi_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseqi.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compare the 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vseqi_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vseqi.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compare the 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_b-__m128i-a-__m128i-b">__m128i __lsx_vslt_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt; (s8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_bu-__m128i-a-__m128i-b">__m128i __lsx_vslt_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt; (u8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_h-__m128i-a-__m128i-b">__m128i __lsx_vslt_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt; (s16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_hu-__m128i-a-__m128i-b">__m128i __lsx_vslt_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt; (u16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_w-__m128i-a-__m128i-b">__m128i __lsx_vslt_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt; (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_wu-__m128i-a-__m128i-b">__m128i __lsx_vslt_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt; (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_d-__m128i-a-__m128i-b">__m128i __lsx_vslt_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt; (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslt_du-__m128i-a-__m128i-b">__m128i __lsx_vslt_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslt_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslt.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt; (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt; imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt; imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt; imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt; imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslti_du-__m128i-a-imm0_31-imm">__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslti.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_b-__m128i-a-__m128i-b">__m128i __lsx_vsle_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt;= (s8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_bu-__m128i-a-__m128i-b">__m128i __lsx_vsle_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt;= (u8)b.byte[i]) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_h-__m128i-a-__m128i-b">__m128i __lsx_vsle_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt;= (s16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_hu-__m128i-a-__m128i-b">__m128i __lsx_vsle_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt;= (u16)b.half[i]) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_w-__m128i-a-__m128i-b">__m128i __lsx_vsle_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt;= (s32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_wu-__m128i-a-__m128i-b">__m128i __lsx_vsle_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt;= (u32)b.word[i]) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_d-__m128i-a-__m128i-b">__m128i __lsx_vsle_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt;= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsle_du-__m128i-a-__m128i-b">__m128i __lsx_vsle_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsle_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsle.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt;= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Compare the signed 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &lt;= imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &lt;= imm) ? 0xFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Compare the signed 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &lt;= imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &lt;= imm) ? 0xFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Compare the signed 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &lt;= imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &lt;= imm) ? 0xFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Compare the signed 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &lt;= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslei_du-__m128i-a-imm0_31-imm">__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslei.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &lt;= imm) ? 0xFFFFFFFFFFFFFFFF : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../fma/" class="btn btn-neutral float-left" title="Fused Multiply-Add"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../integer_computation/" class="btn btn-neutral float-right" title="Integer Computation">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../fma/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../integer_computation/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/integer_computation/index.html b/lsx/integer_computation/index.html
new file mode 100644
index 00000000..eb2b459a
--- /dev/null
+++ b/lsx/integer_computation/index.html
@@ -0,0 +1,11907 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Integer Computation - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page metadata, declared as globals: display title, Markdown source path, and site-absolute URL
+        var mkdocs_page_name = "Integer Computation";
+        var mkdocs_page_input_path = "lsx/integer_computation.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/";
+      </script>
+    <!-- The html5shiv script below sits in a conditional comment, so only Internet Explorer versions before 9 load it -->
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> <!-- hljs comes from the CDN script on the previous line, so that script must stay a blocking (non-deferred) load -->
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Integer Computation</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadd_b-__m128i-a-__m128i-b">__m128i __lsx_vadd_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadd_h-__m128i-a-__m128i-b">__m128i __lsx_vadd_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadd_w-__m128i-a-__m128i-b">__m128i __lsx_vadd_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadd_d-__m128i-a-__m128i-b">__m128i __lsx_vadd_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadd_q-__m128i-a-__m128i-b">__m128i __lsx_vadd_q (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_b-__m128i-a-__m128i-b">__m128i __lsx_vabsd_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_bu-__m128i-a-__m128i-b">__m128i __lsx_vabsd_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_h-__m128i-a-__m128i-b">__m128i __lsx_vabsd_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_hu-__m128i-a-__m128i-b">__m128i __lsx_vabsd_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_w-__m128i-a-__m128i-b">__m128i __lsx_vabsd_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_wu-__m128i-a-__m128i-b">__m128i __lsx_vabsd_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_d-__m128i-a-__m128i-b">__m128i __lsx_vabsd_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vabsd_du-__m128i-a-__m128i-b">__m128i __lsx_vabsd_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadda_b-__m128i-a-__m128i-b">__m128i __lsx_vadda_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadda_h-__m128i-a-__m128i-b">__m128i __lsx_vadda_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadda_w-__m128i-a-__m128i-b">__m128i __lsx_vadda_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vadda_d-__m128i-a-__m128i-b">__m128i __lsx_vadda_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddi_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddi_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddi_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddi_du-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_h_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_w_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_d_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_q_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_q_du-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwev_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_h_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_w_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_d_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_q_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_q_du-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vaddwod_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_44">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_44">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_44">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_44">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_b-__m128i-a-__m128i-b">__m128i __lsx_vavg_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_45">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_45">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_45">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_45">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_bu-__m128i-a-__m128i-b">__m128i __lsx_vavg_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_46">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_46">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_46">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_46">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_h-__m128i-a-__m128i-b">__m128i __lsx_vavg_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_47">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_47">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_47">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_47">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_hu-__m128i-a-__m128i-b">__m128i __lsx_vavg_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_48">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_48">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_48">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_48">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_w-__m128i-a-__m128i-b">__m128i __lsx_vavg_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_49">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_49">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_49">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_49">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_wu-__m128i-a-__m128i-b">__m128i __lsx_vavg_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_50">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_50">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_50">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_50">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_d-__m128i-a-__m128i-b">__m128i __lsx_vavg_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_51">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_51">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_51">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_51">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavg_du-__m128i-a-__m128i-b">__m128i __lsx_vavg_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_52">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_52">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_52">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_52">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_b-__m128i-a-__m128i-b">__m128i __lsx_vavgr_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_53">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_53">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_53">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_53">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_bu-__m128i-a-__m128i-b">__m128i __lsx_vavgr_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_54">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_54">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_54">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_54">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_h-__m128i-a-__m128i-b">__m128i __lsx_vavgr_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_55">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_55">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_55">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_55">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_hu-__m128i-a-__m128i-b">__m128i __lsx_vavgr_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_56">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_56">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_56">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_56">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_w-__m128i-a-__m128i-b">__m128i __lsx_vavgr_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_57">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_57">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_57">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_57">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_wu-__m128i-a-__m128i-b">__m128i __lsx_vavgr_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_58">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_58">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_58">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_58">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_d-__m128i-a-__m128i-b">__m128i __lsx_vavgr_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_59">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_59">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_59">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_59">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vavgr_du-__m128i-a-__m128i-b">__m128i __lsx_vavgr_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_60">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_60">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_60">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_60">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_b-__m128i-a-__m128i-b">__m128i __lsx_vdiv_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_61">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_61">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_61">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_61">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_bu-__m128i-a-__m128i-b">__m128i __lsx_vdiv_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_62">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_62">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_62">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_62">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_h-__m128i-a-__m128i-b">__m128i __lsx_vdiv_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_63">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_63">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_63">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_63">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_hu-__m128i-a-__m128i-b">__m128i __lsx_vdiv_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_64">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_64">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_64">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_64">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_w-__m128i-a-__m128i-b">__m128i __lsx_vdiv_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_65">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_65">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_65">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_65">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_wu-__m128i-a-__m128i-b">__m128i __lsx_vdiv_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_66">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_66">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_66">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_66">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_d-__m128i-a-__m128i-b">__m128i __lsx_vdiv_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_67">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_67">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_67">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_67">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vdiv_du-__m128i-a-__m128i-b">__m128i __lsx_vdiv_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_68">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_68">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_68">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_68">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_h_b-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_69">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_69">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_69">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_69">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_hu_bu-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_70">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_70">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_70">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_70">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_w_h-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_71">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_71">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_71">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_71">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_wu_hu-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_72">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_72">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_72">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_72">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_d_w-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_73">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_73">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_73">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_73">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_du_wu-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_74">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_74">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_74">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_74">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_q_d-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_75">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_75">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_75">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_75">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhaddw_qu_du-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_76">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_76">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_76">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_76">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_h_b-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_77">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_77">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_77">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_77">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_hu_bu-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_78">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_78">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_78">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_78">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_w_h-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_79">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_79">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_79">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_79">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_wu_hu-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_80">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_80">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_80">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_80">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_d_w-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_81">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_81">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_81">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_81">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_du_wu-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_82">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_82">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_82">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_82">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_q_d-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_83">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_83">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_83">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_83">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vhsubw_qu_du-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_84">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_84">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_84">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_84">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmadd_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_85">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_85">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_85">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_85">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmadd_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_86">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_86">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_86">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_86">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmadd_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_87">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_87">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_87">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_87">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmadd_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_88">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_88">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_88">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_88">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_h_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_89">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_89">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_89">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_89">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_h_bu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_90">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_90">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_90">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_90">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_h_bu_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_91">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_91">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_91">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_91">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_w_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_92">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_92">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_92">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_92">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_w_hu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_93">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_93">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_93">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_93">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_w_hu_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_94">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_94">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_94">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_94">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_d_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_95">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_95">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_95">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_95">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_d_wu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_96">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_96">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_96">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_96">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_d_wu_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_97">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_97">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_97">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_97">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_q_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_98">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_98">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_98">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_98">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_q_du-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_99">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_99">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_99">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_99">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwev_q_du_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_100">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_100">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_100">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_100">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_h_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_101">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_101">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_101">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_101">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_h_bu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_102">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_102">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_102">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_102">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_h_bu_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_103">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_103">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_103">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_103">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_w_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_104">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_104">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_104">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_104">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_w_hu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_105">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_105">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_105">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_105">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_w_hu_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_106">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_106">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_106">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_106">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_d_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_107">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_107">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_107">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_107">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_d_wu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_108">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_108">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_108">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_108">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_d_wu_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_109">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_109">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_109">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_109">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_q_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_110">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_110">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_110">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_110">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_q_du-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_111">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_111">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_111">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_111">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaddwod_q_du_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_112">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_112">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_112">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_112">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_b-__m128i-a-__m128i-b">__m128i __lsx_vmax_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_113">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_113">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_113">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_113">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_bu-__m128i-a-__m128i-b">__m128i __lsx_vmax_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_114">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_114">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_114">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_114">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_h-__m128i-a-__m128i-b">__m128i __lsx_vmax_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_115">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_115">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_115">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_115">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_hu-__m128i-a-__m128i-b">__m128i __lsx_vmax_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_116">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_116">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_116">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_116">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_w-__m128i-a-__m128i-b">__m128i __lsx_vmax_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_117">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_117">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_117">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_117">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_wu-__m128i-a-__m128i-b">__m128i __lsx_vmax_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_118">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_118">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_118">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_118">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_d-__m128i-a-__m128i-b">__m128i __lsx_vmax_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_119">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_119">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_119">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_119">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmax_du-__m128i-a-__m128i-b">__m128i __lsx_vmax_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_120">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_120">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_120">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_120">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_121">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_121">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_121">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_121">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_122">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_122">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_122">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_122">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_123">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_123">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_123">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_123">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_124">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_124">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_124">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_124">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_125">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_125">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_125">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_125">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_126">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_126">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_126">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_126">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_127">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_127">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_127">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_127">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmaxi_du-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_128">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_128">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_128">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_128">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_b-__m128i-a-__m128i-b">__m128i __lsx_vmin_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_129">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_129">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_129">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_129">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_bu-__m128i-a-__m128i-b">__m128i __lsx_vmin_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_130">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_130">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_130">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_130">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_h-__m128i-a-__m128i-b">__m128i __lsx_vmin_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_131">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_131">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_131">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_131">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_hu-__m128i-a-__m128i-b">__m128i __lsx_vmin_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_132">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_132">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_132">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_132">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_w-__m128i-a-__m128i-b">__m128i __lsx_vmin_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_133">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_133">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_133">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_133">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_wu-__m128i-a-__m128i-b">__m128i __lsx_vmin_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_134">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_134">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_134">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_134">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_d-__m128i-a-__m128i-b">__m128i __lsx_vmin_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_135">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_135">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_135">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_135">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmin_du-__m128i-a-__m128i-b">__m128i __lsx_vmin_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_136">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_136">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_136">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_136">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_137">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_137">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_137">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_137">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_138">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_138">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_138">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_138">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_139">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_139">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_139">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_139">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_140">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_140">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_140">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_140">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_141">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_141">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_141">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_141">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_142">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_142">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_142">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_142">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_143">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_143">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_143">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_143">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmini_du-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_144">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_144">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_144">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_144">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_b-__m128i-a-__m128i-b">__m128i __lsx_vmod_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_145">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_145">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_145">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_145">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_bu-__m128i-a-__m128i-b">__m128i __lsx_vmod_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_146">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_146">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_146">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_146">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_h-__m128i-a-__m128i-b">__m128i __lsx_vmod_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_147">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_147">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_147">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_147">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_hu-__m128i-a-__m128i-b">__m128i __lsx_vmod_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_148">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_148">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_148">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_148">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_w-__m128i-a-__m128i-b">__m128i __lsx_vmod_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_149">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_149">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_149">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_149">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_wu-__m128i-a-__m128i-b">__m128i __lsx_vmod_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_150">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_150">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_150">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_150">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_d-__m128i-a-__m128i-b">__m128i __lsx_vmod_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_151">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_151">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_151">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_151">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmod_du-__m128i-a-__m128i-b">__m128i __lsx_vmod_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_152">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_152">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_152">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_152">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmsub_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_153">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_153">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_153">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_153">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmsub_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_154">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_154">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_154">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_154">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmsub_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_155">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_155">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_155">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_155">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmsub_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_156">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_156">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_156">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_156">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_b-__m128i-a-__m128i-b">__m128i __lsx_vmuh_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_157">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_157">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_157">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_157">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_bu-__m128i-a-__m128i-b">__m128i __lsx_vmuh_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_158">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_158">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_158">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_158">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_h-__m128i-a-__m128i-b">__m128i __lsx_vmuh_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_159">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_159">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_159">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_159">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_hu-__m128i-a-__m128i-b">__m128i __lsx_vmuh_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_160">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_160">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_160">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_160">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_w-__m128i-a-__m128i-b">__m128i __lsx_vmuh_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_161">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_161">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_161">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_161">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_wu-__m128i-a-__m128i-b">__m128i __lsx_vmuh_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_162">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_162">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_162">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_162">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_d-__m128i-a-__m128i-b">__m128i __lsx_vmuh_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_163">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_163">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_163">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_163">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmuh_du-__m128i-a-__m128i-b">__m128i __lsx_vmuh_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_164">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_164">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_164">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_164">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmul_b-__m128i-a-__m128i-b">__m128i __lsx_vmul_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_165">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_165">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_165">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_165">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmul_h-__m128i-a-__m128i-b">__m128i __lsx_vmul_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_166">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_166">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_166">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_166">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmul_w-__m128i-a-__m128i-b">__m128i __lsx_vmul_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_167">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_167">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_167">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_167">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmul_d-__m128i-a-__m128i-b">__m128i __lsx_vmul_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_168">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_168">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_168">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_168">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_h_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_169">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_169">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_169">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_169">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_170">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_170">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_170">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_170">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_171">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_171">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_171">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_171">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_w_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_172">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_172">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_172">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_172">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_173">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_173">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_173">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_173">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_174">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_174">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_174">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_174">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_d_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_175">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_175">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_175">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_175">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_176">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_176">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_176">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_176">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_177">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_177">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_177">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_177">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_q_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_178">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_178">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_178">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_178">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_q_du-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_179">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_179">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_179">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_179">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwev_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_180">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_180">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_180">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_180">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_h_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_181">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_181">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_181">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_181">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_182">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_182">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_182">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_182">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_183">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_183">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_183">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_183">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_w_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_184">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_184">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_184">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_184">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_185">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_185">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_185">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_185">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_186">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_186">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_186">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_186">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_d_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_187">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_187">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_187">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_187">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_188">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_188">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_188">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_188">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_189">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_189">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_189">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_189">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_q_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_190">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_190">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_190">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_190">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_q_du-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_191">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_191">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_191">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_191">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmulwod_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_192">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_192">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_192">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_192">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vneg_b-__m128i-a">__m128i __lsx_vneg_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_193">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_193">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_193">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_193">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vneg_h-__m128i-a">__m128i __lsx_vneg_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_194">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_194">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_194">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_194">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vneg_w-__m128i-a">__m128i __lsx_vneg_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_195">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_195">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_195">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_195">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vneg_d-__m128i-a">__m128i __lsx_vneg_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_196">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_196">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_196">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_196">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_b-__m128i-a-__m128i-b">__m128i __lsx_vsadd_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_197">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_197">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_197">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_197">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_bu-__m128i-a-__m128i-b">__m128i __lsx_vsadd_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_198">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_198">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_198">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_198">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_h-__m128i-a-__m128i-b">__m128i __lsx_vsadd_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_199">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_199">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_199">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_199">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_hu-__m128i-a-__m128i-b">__m128i __lsx_vsadd_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_200">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_200">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_200">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_200">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_w-__m128i-a-__m128i-b">__m128i __lsx_vsadd_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_201">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_201">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_201">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_201">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_wu-__m128i-a-__m128i-b">__m128i __lsx_vsadd_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_202">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_202">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_202">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_202">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_d-__m128i-a-__m128i-b">__m128i __lsx_vsadd_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_203">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_203">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_203">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_203">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsadd_du-__m128i-a-__m128i-b">__m128i __lsx_vsadd_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_204">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_204">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_204">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_204">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_b-__m128i-a-__m128i-b">__m128i __lsx_vssub_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_205">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_205">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_205">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_205">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_bu-__m128i-a-__m128i-b">__m128i __lsx_vssub_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_206">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_206">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_206">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_206">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_h-__m128i-a-__m128i-b">__m128i __lsx_vssub_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_207">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_207">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_207">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_207">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_hu-__m128i-a-__m128i-b">__m128i __lsx_vssub_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_208">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_208">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_208">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_208">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_w-__m128i-a-__m128i-b">__m128i __lsx_vssub_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_209">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_209">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_209">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_209">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_wu-__m128i-a-__m128i-b">__m128i __lsx_vssub_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_210">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_210">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_210">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_210">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_d-__m128i-a-__m128i-b">__m128i __lsx_vssub_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_211">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_211">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_211">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_211">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssub_du-__m128i-a-__m128i-b">__m128i __lsx_vssub_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_212">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_212">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_212">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_212">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsub_b-__m128i-a-__m128i-b">__m128i __lsx_vsub_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_213">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_213">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_213">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_213">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsub_h-__m128i-a-__m128i-b">__m128i __lsx_vsub_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_214">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_214">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_214">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_214">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsub_w-__m128i-a-__m128i-b">__m128i __lsx_vsub_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_215">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_215">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_215">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_215">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsub_d-__m128i-a-__m128i-b">__m128i __lsx_vsub_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_216">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_216">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_216">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_216">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsub_q-__m128i-a-__m128i-b">__m128i __lsx_vsub_q (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_217">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_217">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_217">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_217">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubi_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_218">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_218">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_218">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_218">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubi_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_219">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_219">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_219">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_219">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubi_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_220">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_220">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_220">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_220">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubi_du-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_221">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_221">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_221">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_221">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_h_b-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_222">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_222">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_222">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_222">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_223">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_223">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_223">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_223">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_w_h-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_224">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_224">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_224">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_224">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_225">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_225">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_225">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_225">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_d_w-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_226">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_226">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_226">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_226">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_227">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_227">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_227">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_227">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_q_d-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_228">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_228">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_228">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_228">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwev_q_du-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_229">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_229">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_229">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_229">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_h_b-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_230">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_230">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_230">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_230">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_231">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_231">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_231">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_231">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_w_h-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_232">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_232">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_232">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_232">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_233">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_233">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_233">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_233">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_d_w-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_234">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_234">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_234">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_234">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_235">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_235">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_235">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_235">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_q_d-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_236">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_236">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_236">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_236">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsubwod_q_du-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_237">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_237">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_237">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_237">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Integer Computation</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="integer-computation">Integer Computation</h1>
+<h2 id="__m128i-__lsx_vadd_b-__m128i-a-__m128i-b">__m128i __lsx_vadd_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadd_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadd.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Add 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] + b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadd_h-__m128i-a-__m128i-b">__m128i __lsx_vadd_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadd_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadd.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Add 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] + b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadd_w-__m128i-a-__m128i-b">__m128i __lsx_vadd_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadd_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadd.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Add 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] + b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadd_d-__m128i-a-__m128i-b">__m128i __lsx_vadd_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadd_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadd.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Add 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] + b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadd_q-__m128i-a-__m128i-b">__m128i __lsx_vadd_q (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadd_q (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadd.q vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Add 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">dst.qword[0] = a.qword[0] + b.qword[0];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_b-__m128i-a-__m128i-b">__m128i __lsx_vabsd_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute absolute difference of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &gt; (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_bu-__m128i-a-__m128i-b">__m128i __lsx_vabsd_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute absolute difference of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &gt; (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])
+                                                : (b.byte[i] - a.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_h-__m128i-a-__m128i-b">__m128i __lsx_vabsd_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute absolute difference of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &gt; (s16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_hu-__m128i-a-__m128i-b">__m128i __lsx_vabsd_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compute absolute difference of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &gt; (u16)b.half[i]) ? (a.half[i] - b.half[i])
+                                                  : (b.half[i] - a.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_w-__m128i-a-__m128i-b">__m128i __lsx_vabsd_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compute absolute difference of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &gt; (s32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_wu-__m128i-a-__m128i-b">__m128i __lsx_vabsd_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Compute absolute difference of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &gt; (u32)b.word[i]) ? (a.word[i] - b.word[i])
+                                                  : (b.word[i] - a.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_d-__m128i-a-__m128i-b">__m128i __lsx_vabsd_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Compute absolute difference of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &gt; (s64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vabsd_du-__m128i-a-__m128i-b">__m128i __lsx_vabsd_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vabsd_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vabsd.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Compute absolute difference of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &gt; (u64)b.dword[i])
+                     ? (a.dword[i] - b.dword[i])
+                     : (b.dword[i] - a.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadda_b-__m128i-a-__m128i-b">__m128i __lsx_vadda_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadda_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadda.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Add the absolute values of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadda_h-__m128i-a-__m128i-b">__m128i __lsx_vadda_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadda_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadda.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Add the absolute values of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadda_w-__m128i-a-__m128i-b">__m128i __lsx_vadda_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadda_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadda.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Add the absolute values of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vadda_d-__m128i-a-__m128i-b">__m128i __lsx_vadda_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vadda_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vadda.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Add the absolute values of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddi_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddi.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Add 8-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddi_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddi.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Add 16-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddi_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddi.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Add 32-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddi_du-__m128i-a-imm0_31-imm">__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddi.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Add 64-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] + imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_h_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Add even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_w_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Add even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_d_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
+<p>Add even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Add even-positioned unsigned 32-bit elements in <code>a</code> to even-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_q_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Add even-positioned signed 64-bit elements in <code>a</code> and <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_q_du-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwev_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Add even-positioned unsigned 64-bit elements in <code>a</code> to even-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_h_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Add odd-positioned signed 8-bit elements in <code>a</code> and <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> to odd-positioned signed 8-bit elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_w_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Add odd-positioned signed 16-bit elements in <code>a</code> and <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> to odd-positioned signed 16-bit elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_d_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Add odd-positioned signed 32-bit elements in <code>a</code> and <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> to odd-positioned signed 32-bit elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_q_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>Add odd-positioned signed 64-bit elements in <code>a</code> and <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_q_du-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vaddwod_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_44">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_44">Description</h3>
+<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> to odd-positioned signed 64-bit elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_44">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_44">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_b-__m128i-a-__m128i-b">__m128i __lsx_vavg_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_45">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_45">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_45">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] &amp; b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_45">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_bu-__m128i-a-__m128i-b">__m128i __lsx_vavg_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_46">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_46">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_46">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] &amp; b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_46">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_h-__m128i-a-__m128i-b">__m128i __lsx_vavg_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_47">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_47">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_47">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] &amp; b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_47">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_hu-__m128i-a-__m128i-b">__m128i __lsx_vavg_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_48">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_48">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_48">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] &amp; b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_48">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_w-__m128i-a-__m128i-b">__m128i __lsx_vavg_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_49">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_49">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_49">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] &amp; b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_49">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_wu-__m128i-a-__m128i-b">__m128i __lsx_vavg_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_50">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_50">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_50">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] &amp; b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_50">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_d-__m128i-a-__m128i-b">__m128i __lsx_vavg_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_51">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_51">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_51">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_51">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavg_du-__m128i-a-__m128i-b">__m128i __lsx_vavg_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_52">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavg_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavg.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_52">Description</h3>
+<p>Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_52">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_52">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_b-__m128i-a-__m128i-b">__m128i __lsx_vavgr_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_53">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_53">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_53">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] | b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_53">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_bu-__m128i-a-__m128i-b">__m128i __lsx_vavgr_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_54">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_54">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_54">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +
+                ((a.byte[i] | b.byte[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_54">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_h-__m128i-a-__m128i-b">__m128i __lsx_vavgr_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_55">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_55">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_55">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] | b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_55">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_hu-__m128i-a-__m128i-b">__m128i __lsx_vavgr_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_56">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_56">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_56">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +
+                ((a.half[i] | b.half[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_56">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_w-__m128i-a-__m128i-b">__m128i __lsx_vavgr_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_57">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_57">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_57">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] | b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_57">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_wu-__m128i-a-__m128i-b">__m128i __lsx_vavgr_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_58">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_58">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_58">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +
+                ((a.word[i] | b.word[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_58">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_d-__m128i-a-__m128i-b">__m128i __lsx_vavgr_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_59">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_59">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_59">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] | b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_59">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vavgr_du-__m128i-a-__m128i-b">__m128i __lsx_vavgr_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_60">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vavgr_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vavgr.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_60">Description</h3>
+<p>Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_60">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +
+                 ((a.dword[i] | b.dword[i]) &amp; 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_60">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_b-__m128i-a-__m128i-b">__m128i __lsx_vdiv_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_61">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_61">Description</h3>
+<p>Divide signed 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_61">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_61">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 32</td>
+<td>0.06(1/15.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 32</td>
+<td>0.06(1/17)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_bu-__m128i-a-__m128i-b">__m128i __lsx_vdiv_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_62">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_62">Description</h3>
+<p>Divide unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_62">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_62">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 33</td>
+<td>0.06(1/16.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 36</td>
+<td>0.06(1/18)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_h-__m128i-a-__m128i-b">__m128i __lsx_vdiv_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_63">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_63">Description</h3>
+<p>Divide signed 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_63">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_63">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17</td>
+<td>0.12(1/8.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 21.5</td>
+<td>0.09(1/11)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_hu-__m128i-a-__m128i-b">__m128i __lsx_vdiv_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_64">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_64">Description</h3>
+<p>Divide unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_64">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_64">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17, 22</td>
+<td>0.11(1/9)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 21.5</td>
+<td>0.07(1/14)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_w-__m128i-a-__m128i-b">__m128i __lsx_vdiv_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_65">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_65">Description</h3>
+<p>Divide signed 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_65">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_65">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 17.5</td>
+<td>0.09(1/11.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_wu-__m128i-a-__m128i-b">__m128i __lsx_vdiv_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_66">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_66">Description</h3>
+<p>Divide unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_66">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_66">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 17.5</td>
+<td>0.07(1/15)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_d-__m128i-a-__m128i-b">__m128i __lsx_vdiv_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_67">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_67">Description</h3>
+<p>Divide signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_67">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_67">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 18.5</td>
+<td>0.11(1/9)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vdiv_du-__m128i-a-__m128i-b">__m128i __lsx_vdiv_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_68">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vdiv_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vdiv.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_68">Description</h3>
+<p>Divide unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_68">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_68">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 18.5</td>
+<td>0.11(1/9)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_h_b-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_69">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_69">Description</h3>
+<p>Add odd-positioned signed 8-bit elements in <code>a</code> to even-positioned signed 8-bit elements in <code>b</code> to get 16-bit result.</p>
+<h3 id="operation_69">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_69">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_hu_bu-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_70">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.hu.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_70">Description</h3>
+<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> to even-positioned unsigned 8-bit elements in <code>b</code> to get 16-bit result.</p>
+<h3 id="operation_70">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_70">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_w_h-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_71">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_71">Description</h3>
+<p>Add odd-positioned signed 16-bit elements in <code>a</code> to even-positioned signed 16-bit elements in <code>b</code> to get 32-bit result.</p>
+<h3 id="operation_71">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_71">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_wu_hu-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_72">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.wu.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_72">Description</h3>
+<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> to even-positioned unsigned 16-bit elements in <code>b</code> to get 32-bit result.</p>
+<h3 id="operation_72">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_72">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_d_w-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_73">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_73">Description</h3>
+<p>Add odd-positioned signed 32-bit elements in <code>a</code> to even-positioned signed 32-bit elements in <code>b</code> to get 64-bit result.</p>
+<h3 id="operation_73">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_73">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_du_wu-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_74">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.du.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_74">Description</h3>
+<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> to even-positioned unsigned 32-bit elements in <code>b</code> to get 64-bit result.</p>
+<h3 id="operation_74">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_74">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_q_d-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_75">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_75">Description</h3>
+<p>Add odd-positioned signed 64-bit elements in <code>a</code> to even-positioned signed 64-bit elements in <code>b</code> to get 128-bit result.</p>
+<h3 id="operation_75">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_75">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhaddw_qu_du-__m128i-a-__m128i-b">__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_76">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhaddw.qu.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_76">Description</h3>
+<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> to even-positioned unsigned 64-bit elements in <code>b</code> to get 128-bit result.</p>
+<h3 id="operation_76">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_76">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_h_b-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_77">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_77">Description</h3>
+<p>Subtract even-positioned signed 8-bit elements in <code>b</code> from odd-positioned signed 8-bit elements in <code>a</code> to get 16-bit result.</p>
+<h3 id="operation_77">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_77">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_hu_bu-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_78">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.hu.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_78">Description</h3>
+<p>Subtract even-positioned unsigned 8-bit elements in <code>b</code> from odd-positioned unsigned 8-bit elements in <code>a</code> to get 16-bit result.</p>
+<h3 id="operation_78">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_78">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_w_h-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_79">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_79">Description</h3>
+<p>Subtract even-positioned signed 16-bit elements in <code>b</code> from odd-positioned signed 16-bit elements in <code>a</code> to get 32-bit result.</p>
+<h3 id="operation_79">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_79">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_wu_hu-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_80">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.wu.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_80">Description</h3>
+<p>Subtract even-positioned unsigned 16-bit elements in <code>b</code> from odd-positioned unsigned 16-bit elements in <code>a</code> to get 32-bit result.</p>
+<h3 id="operation_80">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_80">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_d_w-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_81">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_81">Description</h3>
+<p>Subtract even-positioned signed 32-bit elements in <code>b</code> from odd-positioned signed 32-bit elements in <code>a</code> to get 64-bit result.</p>
+<h3 id="operation_81">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_81">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_du_wu-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_82">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.du.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_82">Description</h3>
+<p>Subtract even-positioned unsigned 32-bit elements in <code>b</code> from odd-positioned unsigned 32-bit elements in <code>a</code> to get 64-bit result.</p>
+<h3 id="operation_82">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_82">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_q_d-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_83">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_83">Description</h3>
+<p>Subtract even-positioned signed 64-bit elements in <code>b</code> from odd-positioned signed 64-bit elements in <code>a</code> to get 128-bit result.</p>
+<h3 id="operation_83">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_83">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vhsubw_qu_du-__m128i-a-__m128i-b">__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_84">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vhsubw.qu.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_84">Description</h3>
+<p>Subtract even-positioned unsigned 64-bit elements in <code>b</code> from odd-positioned unsigned 64-bit elements in <code>a</code> to get 128-bit result.</p>
+<h3 id="operation_84">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_84">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmadd_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_85">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmadd.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_85">Description</h3>
+<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_85">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_85">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmadd_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_86">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmadd.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_86">Description</h3>
+<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_86">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = b.half[i] * c.half[i] + a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_86">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmadd_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_87">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmadd.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_87">Description</h3>
+<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_87">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = b.word[i] * c.word[i] + a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_87">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmadd_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_88">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmadd.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_88">Description</h3>
+<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_88">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_88">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_h_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_89">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_89">Description</h3>
+<p>Multiply even-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_89">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_89">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_h_bu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_90">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_90">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_90">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_90">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_h_bu_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_91">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_91">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_91">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_91">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_w_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_92">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_92">Description</h3>
+<p>Multiply even-positioned signed 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_92">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] =
+      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_92">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_w_hu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_93">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_93">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_93">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_93">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_w_hu_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_94">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_94">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_94">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] =
+      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_94">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_d_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_95">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_95">Description</h3>
+<p>Multiply even-positioned signed 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_95">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] =
+      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_95">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_d_wu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_96">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_96">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_96">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_96">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_d_wu_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_97">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_97">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_97">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] =
+      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_97">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_q_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_98">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_98">Description</h3>
+<p>Multiply even-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_98">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] =
+      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_98">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_q_du-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_99">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_99">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 128-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_99">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_99">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwev_q_du_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_100">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_100">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_100">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] =
+      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_100">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_h_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_101">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_101">Description</h3>
+<p>Multiply odd-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_101">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_101">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_h_bu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_102">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_102">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_102">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_102">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_h_bu_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_103">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_103">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_103">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_103">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_w_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_104">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_104">Description</h3>
+<p>Multiply odd-positioned signed 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_104">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_104">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_w_hu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_105">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_105">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_105">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +
+                (u32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_105">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_w_hu_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_106">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_106">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_106">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +
+                (s32)a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_106">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_d_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_107">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_107">Description</h3>
+<p>Multiply odd-positioned signed 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_107">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_107">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_d_wu-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_108">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_108">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_108">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +
+                 (u64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_108">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_d_wu_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_109">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_109">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_109">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +
+                 (s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_109">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_q_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_110">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_110">Description</h3>
+<p>Multiply odd-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_110">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_110">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_q_du-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_111">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_111">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 128-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_111">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +
+                 (u128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_111">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaddwod_q_du_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_112">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaddwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_112">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_112">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +
+                 (s128)a.qword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_112">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>1.14</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_b-__m128i-a-__m128i-b">__m128i __lsx_vmax_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_113">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_113">Description</h3>
+<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_113">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_113">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_bu-__m128i-a-__m128i-b">__m128i __lsx_vmax_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_114">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_114">Description</h3>
+<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_114">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_114">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_h-__m128i-a-__m128i-b">__m128i __lsx_vmax_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_115">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_115">Description</h3>
+<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_115">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_115">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_hu-__m128i-a-__m128i-b">__m128i __lsx_vmax_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_116">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_116">Description</h3>
+<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_116">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_116">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_w-__m128i-a-__m128i-b">__m128i __lsx_vmax_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_117">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_117">Description</h3>
+<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_117">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_117">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_wu-__m128i-a-__m128i-b">__m128i __lsx_vmax_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_118">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_118">Description</h3>
+<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_118">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_118">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_d-__m128i-a-__m128i-b">__m128i __lsx_vmax_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_119">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_119">Description</h3>
+<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_119">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_119">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmax_du-__m128i-a-__m128i-b">__m128i __lsx_vmax_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_120">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmax_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmax.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_120">Description</h3>
+<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_120">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_120">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_121">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_121">Description</h3>
+<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_121">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = max((s8)a.byte[i], (s8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_121">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_122">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_122">Description</h3>
+<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_122">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = max((u8)a.byte[i], (u8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_122">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_123">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_123">Description</h3>
+<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_123">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = max((s16)a.half[i], (s16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_123">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_124">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_124">Description</h3>
+<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_124">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = max((u16)a.half[i], (u16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_124">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_125">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_125">Description</h3>
+<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_125">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = max((s32)a.word[i], (s32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_125">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_126">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_126">Description</h3>
+<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_126">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = max((u32)a.word[i], (u32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_126">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_127">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_127">Description</h3>
+<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_127">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = max((s64)a.dword[i], (s64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_127">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmaxi_du-__m128i-a-imm0_31-imm">__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_128">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmaxi.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_128">Description</h3>
+<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_128">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = max((u64)a.dword[i], (u64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_128">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_b-__m128i-a-__m128i-b">__m128i __lsx_vmin_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_129">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_129">Description</h3>
+<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_129">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_129">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_bu-__m128i-a-__m128i-b">__m128i __lsx_vmin_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_130">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_130">Description</h3>
+<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_130">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_130">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_h-__m128i-a-__m128i-b">__m128i __lsx_vmin_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_131">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_131">Description</h3>
+<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_131">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_131">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_hu-__m128i-a-__m128i-b">__m128i __lsx_vmin_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_132">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_132">Description</h3>
+<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_132">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_132">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_w-__m128i-a-__m128i-b">__m128i __lsx_vmin_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_133">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_133">Description</h3>
+<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_133">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_133">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_wu-__m128i-a-__m128i-b">__m128i __lsx_vmin_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_134">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_134">Description</h3>
+<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_134">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_134">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_d-__m128i-a-__m128i-b">__m128i __lsx_vmin_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_135">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_135">Description</h3>
+<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_135">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_135">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmin_du-__m128i-a-__m128i-b">__m128i __lsx_vmin_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_136">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmin_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmin.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_136">Description</h3>
+<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_136">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_136">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_b-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_137">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_137">Description</h3>
+<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_137">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = min((s8)a.byte[i], (s8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_137">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_138">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_138">Description</h3>
+<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_138">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = min((u8)a.byte[i], (u8)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_138">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_h-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_139">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_139">Description</h3>
+<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_139">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = min((s16)a.half[i], (s16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_139">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_140">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_140">Description</h3>
+<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_140">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = min((u16)a.half[i], (u16)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_140">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_w-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_141">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_141">Description</h3>
+<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_141">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = min((s32)a.word[i], (s32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_141">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_142">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_142">Description</h3>
+<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_142">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = min((u32)a.word[i], (u32)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_142">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_d-__m128i-a-imm_n16_15-imm">__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)</h2>
+<h3 id="synopsis_143">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_143">Description</h3>
+<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_143">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = min((s64)a.dword[i], (s64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_143">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmini_du-__m128i-a-imm0_31-imm">__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_144">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmini.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_144">Description</h3>
+<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_144">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = min((u64)a.dword[i], (u64)imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_144">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_b-__m128i-a-__m128i-b">__m128i __lsx_vmod_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_145">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_145">Description</h3>
+<p>Compute the remainder of signed 8-bit elements in <code>a</code> divided by the corresponding elements in <code>b</code>.</p>
+<h3 id="operation_145">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_145">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 35</td>
+<td>0.06(1/15.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 33</td>
+<td>0.06(1/17)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_bu-__m128i-a-__m128i-b">__m128i __lsx_vmod_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_146">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_146">Description</h3>
+<p>Compute the remainder of unsigned 8-bit elements in <code>a</code> divided by the corresponding elements in <code>b</code>.</p>
+<h3 id="operation_146">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_146">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>29, 37</td>
+<td>0.06(1/17.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>29, 33</td>
+<td>0.05(1/19)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_h-__m128i-a-__m128i-b">__m128i __lsx_vmod_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_147">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_147">Description</h3>
+<p>Compute the remainder of signed 16-bit elements in <code>a</code> divided by the corresponding elements in <code>b</code>.</p>
+<h3 id="operation_147">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_147">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17, 21</td>
+<td>0.12(1/8.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 21</td>
+<td>0.09(1/11)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_hu-__m128i-a-__m128i-b">__m128i __lsx_vmod_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_148">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_148">Description</h3>
+<p>Compute the remainder of unsigned 16-bit elements in <code>a</code> divided by the corresponding elements in <code>b</code>.</p>
+<h3 id="operation_148">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_148">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>17, 21</td>
+<td>0.11(1/9.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>17, 21</td>
+<td>0.07(1/15)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_w-__m128i-a-__m128i-b">__m128i __lsx_vmod_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_149">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_149">Description</h3>
+<p>Compute the remainder of signed 32-bit elements in <code>a</code> divided by the corresponding elements in <code>b</code>.</p>
+<h3 id="operation_149">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_149">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11, 13</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 15</td>
+<td>0.08(1/12)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_wu-__m128i-a-__m128i-b">__m128i __lsx_vmod_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_150">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_150">Description</h3>
+<p>Compute the remainder of unsigned 32-bit elements in <code>a</code> divided by the corresponding elements in <code>b</code>.</p>
+<h3 id="operation_150">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_150">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>11, 13</td>
+<td>0.18(1/5.5)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>11, 15</td>
+<td>0.06(1/16)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_d-__m128i-a-__m128i-b">__m128i __lsx_vmod_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_151">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_151">Description</h3>
+<p>Compute the remainder of dividing signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_151">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_151">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8, 10</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 10</td>
+<td>0.11(1/9.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmod_du-__m128i-a-__m128i-b">__m128i __lsx_vmod_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_152">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmod_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmod.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_152">Description</h3>
+<p>Compute the remainder of dividing unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>
+<h3 id="operation_152">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_152">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>8, 10</td>
+<td>0.25(1/4)</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>8, 10</td>
+<td>0.11(1/9.5)</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmsub_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_153">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmsub.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_153">Description</h3>
+<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_153">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_153">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmsub_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_154">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmsub.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_154">Description</h3>
+<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_154">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_154">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmsub_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_155">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmsub.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_155">Description</h3>
+<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_155">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_155">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmsub_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_156">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmsub.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_156">Description</h3>
+<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_156">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_156">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_b-__m128i-a-__m128i-b">__m128i __lsx_vmuh_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_157">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_157">Description</h3>
+<p>Multiply signed 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>
+<h3 id="operation_157">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) &gt;&gt; 8;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_157">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_bu-__m128i-a-__m128i-b">__m128i __lsx_vmuh_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_158">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_158">Description</h3>
+<p>Multiply unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>
+<h3 id="operation_158">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) &gt;&gt; 8;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_158">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_h-__m128i-a-__m128i-b">__m128i __lsx_vmuh_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_159">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_159">Description</h3>
+<p>Multiply signed 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_159">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) &gt;&gt; 16;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_159">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_hu-__m128i-a-__m128i-b">__m128i __lsx_vmuh_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_160">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_160">Description</h3>
+<p>Multiply unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_160">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) &gt;&gt; 16;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_160">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_w-__m128i-a-__m128i-b">__m128i __lsx_vmuh_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_161">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_161">Description</h3>
+<p>Multiply signed 32-bit elements in <code>a</code> and <code>b</code>, save the high 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_161">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) &gt;&gt; 32;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_161">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_wu-__m128i-a-__m128i-b">__m128i __lsx_vmuh_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_162">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_162">Description</h3>
+<p>Multiply unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the high 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_162">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) &gt;&gt; 32;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_162">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_d-__m128i-a-__m128i-b">__m128i __lsx_vmuh_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_163">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_163">Description</h3>
+<p>Multiply signed 64-bit elements in <code>a</code> and <code>b</code>, save the high 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_163">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) &gt;&gt; 64;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_163">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmuh_du-__m128i-a-__m128i-b">__m128i __lsx_vmuh_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_164">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmuh_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmuh.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_164">Description</h3>
+<p>Multiply unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the high 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_164">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) &gt;&gt; 64;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_164">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmul_b-__m128i-a-__m128i-b">__m128i __lsx_vmul_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_165">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmul_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmul.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_165">Description</h3>
+<p>Multiply 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_165">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] * b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_165">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmul_h-__m128i-a-__m128i-b">__m128i __lsx_vmul_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_166">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmul_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmul.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_166">Description</h3>
+<p>Multiply 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_166">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] * b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_166">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmul_w-__m128i-a-__m128i-b">__m128i __lsx_vmul_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_167">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmul_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmul.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_167">Description</h3>
+<p>Multiply 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_167">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] * b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_167">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmul_d-__m128i-a-__m128i-b">__m128i __lsx_vmul_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_168">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmul_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmul.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_168">Description</h3>
+<p>Multiply 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_168">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] * b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_168">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_h_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_169">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_169">Description</h3>
+<p>Multiply even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_169">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_169">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_170">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_170">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_170">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_170">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_171">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.h.bu.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_171">Description</h3>
+<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_171">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_171">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_w_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_172">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_172">Description</h3>
+<p>Multiply even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_172">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_172">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_173">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_173">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_173">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_173">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_174">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.w.hu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_174">Description</h3>
+<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_174">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_174">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_d_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_175">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_175">Description</h3>
+<p>Multiply even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_175">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_175">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_176">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_176">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_176">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_176">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_177">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.d.wu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_177">Description</h3>
+<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_177">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_177">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_q_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_178">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_178">Description</h3>
+<p>Multiply even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_178">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_178">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_q_du-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_179">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_179">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_179">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_179">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwev_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_180">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwev.q.du.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_180">Description</h3>
+<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_180">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_180">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_h_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_181">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_181">Description</h3>
+<p>Multiply odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_181">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_181">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_182">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_182">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_182">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_182">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_h_bu_b-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_183">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.h.bu.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_183">Description</h3>
+<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_183">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_183">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_w_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_184">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_184">Description</h3>
+<p>Multiply odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_184">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_184">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_185">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_185">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_185">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_185">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_w_hu_h-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_186">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.w.hu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_186">Description</h3>
+<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_186">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_186">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_d_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_187">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_187">Description</h3>
+<p>Multiply odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_187">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_187">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_188">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_188">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_188">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_188">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_d_wu_w-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_189">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.d.wu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_189">Description</h3>
+<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_189">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_189">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_q_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_190">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_190">Description</h3>
+<p>Multiply odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_190">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_190">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_q_du-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_191">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_191">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_191">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_191">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmulwod_q_du_d-__m128i-a-__m128i-b">__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_192">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmulwod.q.du.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_192">Description</h3>
+<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_192">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_192">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>7</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vneg_b-__m128i-a">__m128i __lsx_vneg_b (__m128i a)</h2>
+<h3 id="synopsis_193">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vneg_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vneg.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_193">Description</h3>
+<p>Negate 8-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_193">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = -a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_193">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vneg_h-__m128i-a">__m128i __lsx_vneg_h (__m128i a)</h2>
+<h3 id="synopsis_194">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vneg_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vneg.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_194">Description</h3>
+<p>Negate 16-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_194">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = -a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_194">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vneg_w-__m128i-a">__m128i __lsx_vneg_w (__m128i a)</h2>
+<h3 id="synopsis_195">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vneg_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vneg.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_195">Description</h3>
+<p>Negate 32-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_195">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = -a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_195">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vneg_d-__m128i-a">__m128i __lsx_vneg_d (__m128i a)</h2>
+<h3 id="synopsis_196">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vneg_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vneg.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_196">Description</h3>
+<p>Negate 64-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>
+<h3 id="operation_196">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = -a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_196">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_b-__m128i-a-__m128i-b">__m128i __lsx_vsadd_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_197">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_197">Description</h3>
+<p>Saturating add the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_197">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_197">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_bu-__m128i-a-__m128i-b">__m128i __lsx_vsadd_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_198">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_198">Description</h3>
+<p>Saturating add the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_198">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_198">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_h-__m128i-a-__m128i-b">__m128i __lsx_vsadd_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_199">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_199">Description</h3>
+<p>Saturating add the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_199">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_199">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_hu-__m128i-a-__m128i-b">__m128i __lsx_vsadd_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_200">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_200">Description</h3>
+<p>Saturating add the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_200">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_200">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_w-__m128i-a-__m128i-b">__m128i __lsx_vsadd_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_201">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_201">Description</h3>
+<p>Saturating add the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_201">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_201">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_wu-__m128i-a-__m128i-b">__m128i __lsx_vsadd_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_202">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_202">Description</h3>
+<p>Saturating add the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_202">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_202">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_d-__m128i-a-__m128i-b">__m128i __lsx_vsadd_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_203">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_203">Description</h3>
+<p>Saturating add the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_203">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_203">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsadd_du-__m128i-a-__m128i-b">__m128i __lsx_vsadd_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_204">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsadd_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsadd.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_204">Description</h3>
+<p>Saturating add the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_204">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_204">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_b-__m128i-a-__m128i-b">__m128i __lsx_vssub_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_205">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_205">Description</h3>
+<p>Saturating subtract the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_205">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_205">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_bu-__m128i-a-__m128i-b">__m128i __lsx_vssub_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_206">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_206">Description</h3>
+<p>Saturating subtract the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_206">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_206">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_h-__m128i-a-__m128i-b">__m128i __lsx_vssub_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_207">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_207">Description</h3>
+<p>Saturating subtract the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_207">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_207">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_hu-__m128i-a-__m128i-b">__m128i __lsx_vssub_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_208">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_208">Description</h3>
+<p>Saturating subtract the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_208">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_208">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_w-__m128i-a-__m128i-b">__m128i __lsx_vssub_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_209">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_209">Description</h3>
+<p>Saturating subtract the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_209">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_209">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_wu-__m128i-a-__m128i-b">__m128i __lsx_vssub_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_210">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_210">Description</h3>
+<p>Saturating subtract the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_210">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_210">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_d-__m128i-a-__m128i-b">__m128i __lsx_vssub_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_211">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_211">Description</h3>
+<p>Saturating subtract the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_211">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_211">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssub_du-__m128i-a-__m128i-b">__m128i __lsx_vssub_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_212">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssub_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssub.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_212">Description</h3>
+<p>Saturating subtract the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_212">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_212">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsub_b-__m128i-a-__m128i-b">__m128i __lsx_vsub_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_213">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsub_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsub.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_213">Description</h3>
+<p>Subtract 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_213">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] - b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_213">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsub_h-__m128i-a-__m128i-b">__m128i __lsx_vsub_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_214">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsub_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsub.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_214">Description</h3>
+<p>Subtract 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_214">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] - b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_214">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsub_w-__m128i-a-__m128i-b">__m128i __lsx_vsub_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_215">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsub_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsub.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_215">Description</h3>
+<p>Subtract 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_215">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] - b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_215">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsub_d-__m128i-a-__m128i-b">__m128i __lsx_vsub_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_216">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsub_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsub.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_216">Description</h3>
+<p>Subtract 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_216">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] - b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_216">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsub_q-__m128i-a-__m128i-b">__m128i __lsx_vsub_q (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_217">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsub_q (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsub.q vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_217">Description</h3>
+<p>Subtract 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_217">Operation</h3>
+<pre><code class="language-c++">dst.qword[0] = a.qword[0] - b.qword[0];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_217">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubi_bu-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_218">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubi.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_218">Description</h3>
+<p>Subtract 8-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_218">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_218">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubi_hu-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_219">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubi.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_219">Description</h3>
+<p>Subtract 16-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_219">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_219">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubi_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_220">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubi.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_220">Description</h3>
+<p>Subtract 32-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_220">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_220">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubi_du-__m128i-a-imm0_31-imm">__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_221">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubi.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_221">Description</h3>
+<p>Subtract 64-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>
+<h3 id="operation_221">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] - imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_221">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_h_b-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_222">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_222">Description</h3>
+<p>Subtract even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_222">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_222">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_223">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_223">Description</h3>
+<p>Subtract even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_223">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_223">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_w_h-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_224">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_224">Description</h3>
+<p>Subtract even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_224">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_224">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_225">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_225">Description</h3>
+<p>Subtract even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_225">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_225">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_d_w-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_226">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_226">Description</h3>
+<p>Subtract even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_226">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_226">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_227">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_227">Description</h3>
+<p>Subtract even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_227">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_227">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_q_d-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_228">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_228">Description</h3>
+<p>Subtract even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_228">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_228">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwev_q_du-__m128i-a-__m128i-b">__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_229">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwev.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_229">Description</h3>
+<p>Subtract even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_229">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_229">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_h_b-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_230">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.h.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_230">Description</h3>
+<p>Subtract odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_230">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_230">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_h_bu-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_231">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.h.bu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_231">Description</h3>
+<p>Subtract odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>
+<h3 id="operation_231">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_231">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_w_h-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_232">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.w.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_232">Description</h3>
+<p>Subtract odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_232">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_232">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_w_hu-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_233">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.w.hu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_233">Description</h3>
+<p>Subtract odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>
+<h3 id="operation_233">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_233">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_d_w-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_234">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.d.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_234">Description</h3>
+<p>Subtract odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_234">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_234">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_d_wu-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_235">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.d.wu vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_235">Description</h3>
+<p>Subtract odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>
+<h3 id="operation_235">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_235">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_q_d-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_236">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.q.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_236">Description</h3>
+<p>Subtract odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_236">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_236">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsubwod_q_du-__m128i-a-__m128i-b">__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_237">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsubwod.q.du vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_237">Description</h3>
+<p>Subtract odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>
+<h3 id="operation_237">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_237">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../integer_comparison/" class="btn btn-neutral float-left" title="Integer Comparison"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../logical/" class="btn btn-neutral float-right" title="Logical">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../integer_comparison/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../logical/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/logical/index.html b/lsx/logical/index.html
new file mode 100644
index 00000000..8c2636cd
--- /dev/null
+++ b/lsx/logical/index.html
@@ -0,0 +1,689 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/logical/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Logical - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Logical";
+        var mkdocs_page_input_path = "lsx/logical.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/logical/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Logical</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vand_v-__m128i-a-__m128i-b">__m128i __lsx_vand_v (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vandi_b-__m128i-a-imm0_255-imm">__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vandn_v-__m128i-a-__m128i-b">__m128i __lsx_vandn_v (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vnor_v-__m128i-a-__m128i-b">__m128i __lsx_vnor_v (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vnori_b-__m128i-a-imm0_255-imm">__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vor_v-__m128i-a-__m128i-b">__m128i __lsx_vor_v (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vori_b-__m128i-a-imm0_255-imm">__m128i __lsx_vori_b (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vorn_v-__m128i-a-__m128i-b">__m128i __lsx_vorn_v (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vxor_v-__m128i-a-__m128i-b">__m128i __lsx_vxor_v (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vxori_b-__m128i-a-imm0_255-imm">__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Logical</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="logical">Logical</h1>
+<h2 id="__m128i-__lsx_vand_v-__m128i-a-__m128i-b">__m128i __lsx_vand_v (__m128i a, __m128i b)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vand_v (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vand.v vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Compute bitwise AND between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &amp; b.dword[i];
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vandi_b-__m128i-a-imm0_255-imm">__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vandi.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Compute bitwise AND between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &amp; imm;
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vandn_v-__m128i-a-__m128i-b">__m128i __lsx_vandn_v (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vandn_v (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vandn.v vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Compute bitwise ANDN between elements in <code>a</code> and <code>b</code>, i.e. <code>b &amp; ~a</code> (the first operand <code>a</code> is the one that is inverted).</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = b.dword[i] &amp; (~a.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vnor_v-__m128i-a-__m128i-b">__m128i __lsx_vnor_v (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vnor_v (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vnor.v vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Compute bitwise NOR between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ~(a.dword[i] | b.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vnori_b-__m128i-a-imm0_255-imm">__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vnori.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Compute bitwise NOR between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ~(a.byte[i] | imm);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vor_v-__m128i-a-__m128i-b">__m128i __lsx_vor_v (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vor_v (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vor.v vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Compute bitwise OR between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] | b.dword[i];
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vori_b-__m128i-a-imm0_255-imm">__m128i __lsx_vori_b (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vori_b (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vori.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Compute bitwise OR between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] | imm;
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vorn_v-__m128i-a-__m128i-b">__m128i __lsx_vorn_v (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vorn_v (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vorn.v vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Compute bitwise ORN between elements in <code>a</code> and <code>b</code>, i.e. <code>a | ~b</code> (the second operand <code>b</code> is the one that is inverted).</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] | (~b.dword[i]);
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vxor_v-__m128i-a-__m128i-b">__m128i __lsx_vxor_v (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vxor_v (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vxor.v vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Compute bitwise XOR between elements in <code>a</code> and <code>b</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] ^ b.dword[i];
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vxori_b-__m128i-a-imm0_255-imm">__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vxori.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Compute bitwise XOR between elements in <code>a</code> and <code>imm</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] ^ imm;
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../integer_computation/" class="btn btn-neutral float-left" title="Integer Computation"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../memory/" class="btn btn-neutral float-right" title="Memory Load & Store">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../integer_computation/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../memory/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/memory/index.html b/lsx/memory/index.html
new file mode 100644
index 00000000..16cfc29f
--- /dev/null
+++ b/lsx/memory/index.html
@@ -0,0 +1,475 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/memory/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Memory Load & Store - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Memory Load \u0026 Store";
+        var mkdocs_page_input_path = "lsx/memory.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/memory/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Memory Load & Store</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vld-void-addr-imm_n2048_2047-offset">__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vldx-void-addr-long-int-offset">__m128i __lsx_vldx (void * addr, long int offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vldrepl_b-void-addr-imm_n2048_2047-offset">__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vldrepl_h-void-addr-imm_n1024_1023-offset">__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vldrepl_w-void-addr-imm_n512_511-offset">__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vldrepl_d-void-addr-imm_n256_255-offset">__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lsx_vst-__m128i-data-void-addr-imm_n2048_2047-offset">void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lsx_vstx-__m128i-data-void-addr-long-int-offset">void __lsx_vstx (__m128i data, void * addr, long int offset)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lsx_vstelm_b-__m128i-data-void-addr-imm_n128_127-offset-imm0_15-lane">void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lsx_vstelm_h-__m128i-data-void-addr-imm_n128_127-offset-imm0_7-lane">void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lsx_vstelm_w-__m128i-data-void-addr-imm_n128_127-offset-imm0_3-lane">void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#void-__lsx_vstelm_d-__m128i-data-void-addr-imm_n128_127-offset-imm0_1-lane">void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Memory Load &amp; Store</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="memory-load-store">Memory Load &amp; Store</h1>
+<h2 id="__m128i-__lsx_vld-void-addr-imm_n2048_2047-offset">__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vld vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Read a whole vector from memory address <code>addr + offset</code> and save the data into <code>dst</code>. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">dst = memory_load(128, addr + offset);
+</code></pre>
+<h2 id="__m128i-__lsx_vldx-void-addr-long-int-offset">__m128i __lsx_vldx (void * addr, long int offset)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vldx (void * addr, long int offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldx vr, r, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Read a whole vector from memory address <code>addr + offset</code> and save the data into <code>dst</code>. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">dst = memory_load(128, addr + offset);
+</code></pre>
+<h2 id="__m128i-__lsx_vldrepl_b-void-addr-imm_n2048_2047-offset">__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldrepl.b vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Read 8-bit data from memory address <code>addr + (offset &lt;&lt; 0)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">u8 data = memory_load(8, addr + offset);
+for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = data;
+}
+</code></pre>
+<h2 id="__m128i-__lsx_vldrepl_h-void-addr-imm_n1024_1023-offset">__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldrepl.h vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Read 16-bit data from memory address <code>addr + (offset &lt;&lt; 1)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">u16 data = memory_load(16, addr + (offset &lt;&lt; 1));
+for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = data;
+}
+</code></pre>
+<h2 id="__m128i-__lsx_vldrepl_w-void-addr-imm_n512_511-offset">__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldrepl.w vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Read 32-bit data from memory address <code>addr + (offset &lt;&lt; 2)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">u32 data = memory_load(32, addr + (offset &lt;&lt; 2));
+for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = data;
+}
+</code></pre>
+<h2 id="__m128i-__lsx_vldrepl_d-void-addr-imm_n256_255-offset">__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldrepl.d vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Read 64-bit data from memory address <code>addr + (offset &lt;&lt; 3)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">u64 data = memory_load(64, addr + (offset &lt;&lt; 3));
+for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = data;
+}
+</code></pre>
+<h2 id="void-__lsx_vst-__m128i-data-void-addr-imm_n2048_2047-offset">void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vst vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Write the whole vector in <code>data</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">memory_store(128, data, addr + offset);
+</code></pre>
+<h2 id="void-__lsx_vstx-__m128i-data-void-addr-long-int-offset">void __lsx_vstx (__m128i data, void * addr, long int offset)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">void __lsx_vstx (__m128i data, void * addr, long int offset)
+#include &lt;lsxintrin.h&gt;
+Instruction: vstx vr, r, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Write the whole vector in <code>data</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">memory_store(128, data, addr + offset);
+</code></pre>
+<h2 id="void-__lsx_vstelm_b-__m128i-data-void-addr-imm_n128_127-offset-imm0_15-lane">void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)
+#include &lt;lsxintrin.h&gt;
+Instruction: vstelm.b vr, r, imm, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Store the 8-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">memory_store(8, data.byte[lane], addr + offset);
+</code></pre>
+<h2 id="void-__lsx_vstelm_h-__m128i-data-void-addr-imm_n128_127-offset-imm0_7-lane">void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)
+#include &lt;lsxintrin.h&gt;
+Instruction: vstelm.h vr, r, imm, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Store the 16-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">memory_store(16, data.half[lane], addr + offset);
+</code></pre>
+<h2 id="void-__lsx_vstelm_w-__m128i-data-void-addr-imm_n128_127-offset-imm0_3-lane">void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)
+#include &lt;lsxintrin.h&gt;
+Instruction: vstelm.w vr, r, imm, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Store the 32-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">memory_store(32, data.word[lane], addr + offset);
+</code></pre>
+<h2 id="void-__lsx_vstelm_d-__m128i-data-void-addr-imm_n128_127-offset-imm0_1-lane">void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)
+#include &lt;lsxintrin.h&gt;
+Instruction: vstelm.d vr, r, imm, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Store the 64-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">memory_store(64, data.dword[lane], addr + offset);
+</code></pre>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../logical/" class="btn btn-neutral float-left" title="Logical"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../misc/" class="btn btn-neutral float-right" title="Misc">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../logical/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../misc/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/misc/index.html b/lsx/misc/index.html
new file mode 100644
index 00000000..ba3d554f
--- /dev/null
+++ b/lsx/misc/index.html
@@ -0,0 +1,4669 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/misc/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Misc - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Misc";
+        var mkdocs_page_input_path = "lsx/misc.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/misc/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Misc</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_h_b-__m128i-a">__m128i __lsx_vexth_h_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_hu_bu-__m128i-a">__m128i __lsx_vexth_hu_bu (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_w_h-__m128i-a">__m128i __lsx_vexth_w_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_wu_hu-__m128i-a">__m128i __lsx_vexth_wu_hu (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_d_w-__m128i-a">__m128i __lsx_vexth_d_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_du_wu-__m128i-a">__m128i __lsx_vexth_du_wu (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_q_d-__m128i-a">__m128i __lsx_vexth_q_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vexth_qu_du-__m128i-a">__m128i __lsx_vexth_qu_du (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vextl_q_d-__m128i-a">__m128i __lsx_vextl_q_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vextl_qu_du-__m128i-a">__m128i __lsx_vextl_qu_du (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vextrins_b-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vextrins_h-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vextrins_w-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vextrins_d-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvh_b-__m128i-a-__m128i-b">__m128i __lsx_vilvh_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvh_h-__m128i-a-__m128i-b">__m128i __lsx_vilvh_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvh_w-__m128i-a-__m128i-b">__m128i __lsx_vilvh_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvh_d-__m128i-a-__m128i-b">__m128i __lsx_vilvh_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvl_b-__m128i-a-__m128i-b">__m128i __lsx_vilvl_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvl_h-__m128i-a-__m128i-b">__m128i __lsx_vilvl_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvl_w-__m128i-a-__m128i-b">__m128i __lsx_vilvl_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vilvl_d-__m128i-a-__m128i-b">__m128i __lsx_vilvl_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vinsgr2vr_b-__m128i-a-int-b-imm0_15-imm">__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vinsgr2vr_h-__m128i-a-int-b-imm0_7-imm">__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vinsgr2vr_w-__m128i-a-int-b-imm0_3-imm">__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vinsgr2vr_d-__m128i-a-long-int-b-imm0_1-imm">__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfrstp_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfrstp_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfrstpi_b-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vfrstpi_h-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmskgez_b-__m128i-a">__m128i __lsx_vmskgez_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmskltz_b-__m128i-a">__m128i __lsx_vmskltz_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_1">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmskltz_h-__m128i-a">__m128i __lsx_vmskltz_h (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_2">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmskltz_w-__m128i-a">__m128i __lsx_vmskltz_w (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_3">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmskltz_d-__m128i-a">__m128i __lsx_vmskltz_d (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_4">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vmsknz_b-__m128i-a">__m128i __lsx_vmsknz_b (__m128i a)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_5">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackev_b-__m128i-a-__m128i-b">__m128i __lsx_vpackev_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackev_h-__m128i-a-__m128i-b">__m128i __lsx_vpackev_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackev_w-__m128i-a-__m128i-b">__m128i __lsx_vpackev_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackev_d-__m128i-a-__m128i-b">__m128i __lsx_vpackev_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackod_b-__m128i-a-__m128i-b">__m128i __lsx_vpackod_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackod_h-__m128i-a-__m128i-b">__m128i __lsx_vpackod_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackod_w-__m128i-a-__m128i-b">__m128i __lsx_vpackod_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpackod_d-__m128i-a-__m128i-b">__m128i __lsx_vpackod_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickev_b-__m128i-a-__m128i-b">__m128i __lsx_vpickev_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_44">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_44">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_44">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_44">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickev_h-__m128i-a-__m128i-b">__m128i __lsx_vpickev_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_45">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_45">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_45">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_45">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickev_w-__m128i-a-__m128i-b">__m128i __lsx_vpickev_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_46">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_46">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_46">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_46">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickev_d-__m128i-a-__m128i-b">__m128i __lsx_vpickev_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_47">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_47">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_47">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_47">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_vpickve2gr_b-__m128i-a-imm0_15-idx">int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_48">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_48">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_48">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_48">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#unsigned-int-__lsx_vpickve2gr_bu-__m128i-a-imm0_15-idx">unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_49">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_49">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_49">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_49">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_vpickve2gr_h-__m128i-a-imm0_7-idx">int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_50">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_50">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_50">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_50">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#unsigned-int-__lsx_vpickve2gr_hu-__m128i-a-imm0_7-idx">unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_51">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_51">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_51">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_51">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#int-__lsx_vpickve2gr_w-__m128i-a-imm0_3-idx">int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_52">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_52">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_52">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_52">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#unsigned-int-__lsx_vpickve2gr_wu-__m128i-a-imm0_3-idx">unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_53">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_53">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_53">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_53">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#long-int-__lsx_vpickve2gr_d-__m128i-a-imm0_1-idx">long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_54">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_54">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_54">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_54">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#unsigned-long-int-__lsx_vpickve2gr_du-__m128i-a-imm0_1-idx">unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_55">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_55">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_55">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_55">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickod_b-__m128i-a-__m128i-b">__m128i __lsx_vpickod_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_56">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_56">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_56">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_56">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickod_h-__m128i-a-__m128i-b">__m128i __lsx_vpickod_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_57">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_57">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_57">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_57">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickod_w-__m128i-a-__m128i-b">__m128i __lsx_vpickod_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_58">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_58">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_58">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_58">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpickod_d-__m128i-a-__m128i-b">__m128i __lsx_vpickod_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_59">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_59">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_59">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_59">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrepli_b-imm_n512_511-imm">__m128i __lsx_vrepli_b (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_60">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_60">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_60">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrepli_h-imm_n512_511-imm">__m128i __lsx_vrepli_h (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_61">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_61">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_61">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrepli_w-imm_n512_511-imm">__m128i __lsx_vrepli_w (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_62">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_62">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_62">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrepli_d-imm_n512_511-imm">__m128i __lsx_vrepli_d (imm_n512_511 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_63">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_63">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_63">Operation</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplgr2vr_b-int-val">__m128i __lsx_vreplgr2vr_b (int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_64">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_64">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_64">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_60">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplgr2vr_h-int-val">__m128i __lsx_vreplgr2vr_h (int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_65">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_65">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_65">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_61">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplgr2vr_w-int-val">__m128i __lsx_vreplgr2vr_w (int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_66">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_66">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_66">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_62">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplgr2vr_d-long-int-val">__m128i __lsx_vreplgr2vr_d (long int val)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_67">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_67">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_67">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_63">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplve_b-__m128i-a-int-idx">__m128i __lsx_vreplve_b (__m128i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_68">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_68">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_68">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_64">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplve_h-__m128i-a-int-idx">__m128i __lsx_vreplve_h (__m128i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_69">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_69">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_69">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_65">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplve_w-__m128i-a-int-idx">__m128i __lsx_vreplve_w (__m128i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_70">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_70">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_70">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_66">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplve_d-__m128i-a-int-idx">__m128i __lsx_vreplve_d (__m128i a, int idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_71">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_71">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_71">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_67">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplvei_b-__m128i-a-imm0_15-idx">__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_72">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_72">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_72">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_68">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplvei_h-__m128i-a-imm0_7-idx">__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_73">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_73">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_73">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_69">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplvei_w-__m128i-a-imm0_3-idx">__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_74">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_74">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_74">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_70">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vreplvei_d-__m128i-a-imm0_1-idx">__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_75">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_75">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_75">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_71">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_76">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_76">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_76">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_72">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_bu-__m128i-a-imm0_7-imm">__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_77">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_77">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_77">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_73">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_78">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_78">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_78">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_74">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_hu-__m128i-a-imm0_15-imm">__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_79">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_79">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_79">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_75">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_80">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_80">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_80">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_76">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_81">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_81">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_81">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_77">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_82">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_82">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_82">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_78">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsat_du-__m128i-a-imm0_63-imm">__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_83">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_83">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_83">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_79">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsigncov_b-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_84">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_84">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_84">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_80">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsigncov_h-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_85">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_85">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_85">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_81">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsigncov_w-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_86">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_86">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_86">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_82">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsigncov_d-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_87">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_87">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_87">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_83">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vldi-imm_n1024_1023-imm">__m128i __lsx_vldi (imm_n1024_1023 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_88">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_88">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_88">Operation</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">LSX</li>
+      <li class="breadcrumb-item active">Misc</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="misc">Misc</h1>
+<h2 id="__m128i-__lsx_vexth_h_b-__m128i-a">__m128i __lsx_vexth_h_b (__m128i a)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_h_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.h.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Extend signed 8-bit elements in the higher half of <code>a</code> to 16-bit.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[8 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_hu_bu-__m128i-a">__m128i __lsx_vexth_hu_bu (__m128i a)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_hu_bu (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.hu.bu vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Extend unsigned 8-bit elements in the higher half of <code>a</code> to 16-bit.</p>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[8 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_w_h-__m128i-a">__m128i __lsx_vexth_w_h (__m128i a)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_w_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.w.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Extend signed 16-bit elements in the higher half of <code>a</code> to 32-bit.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[4 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_wu_hu-__m128i-a">__m128i __lsx_vexth_wu_hu (__m128i a)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_wu_hu (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.wu.hu vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Extend unsigned 16-bit elements in the higher half of <code>a</code> to 32-bit.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[4 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_d_w-__m128i-a">__m128i __lsx_vexth_d_w (__m128i a)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_d_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.d.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Extend signed 32-bit elements in the higher half of <code>a</code> to 64-bit.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[2 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_du_wu-__m128i-a">__m128i __lsx_vexth_du_wu (__m128i a)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_du_wu (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.du.wu vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Extend unsigned 32-bit elements in the higher half of <code>a</code> to 64-bit.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[2 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_q_d-__m128i-a">__m128i __lsx_vexth_q_d (__m128i a)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_q_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.q.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Extend the signed 64-bit element in the higher half of <code>a</code> to 128-bit.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[1 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vexth_qu_du-__m128i-a">__m128i __lsx_vexth_qu_du (__m128i a)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vexth_qu_du (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vexth.qu.du vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Extend the unsigned 64-bit element in the higher half of <code>a</code> to 128-bit.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[1 + i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vextl_q_d-__m128i-a">__m128i __lsx_vextl_q_d (__m128i a)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vextl_q_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vextl.q.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Extend the signed 64-bit element in the lower half of <code>a</code> to 128-bit.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (s128)(s64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vextl_qu_du-__m128i-a">__m128i __lsx_vextl_qu_du (__m128i a)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vextl_qu_du (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vextl.qu.du vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Extend the unsigned 64-bit element in the lower half of <code>a</code> to 128-bit.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 1; i++) {
+  dst.qword[i] = (u128)(u64)a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vextrins_b-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vextrins.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
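+<p>Extract one 8-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: the low 4 bits of <code>imm</code> select the source element in <code>b</code>, and bits 4 to 7 select the destination position in <code>a</code>.</p>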
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i == ((imm &gt;&gt; 4) &amp; 15)) ? b.byte[imm &amp; 15] : a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vextrins_h-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vextrins.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
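+<p>Extract one 16-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: the low 3 bits of <code>imm</code> select the source element in <code>b</code>, and bits 4 to 6 select the destination position in <code>a</code>.</p>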
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i == ((imm &gt;&gt; 4) &amp; 7)) ? b.half[imm &amp; 7] : a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vextrins_w-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vextrins.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
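+<p>Extract one 32-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: the low 2 bits of <code>imm</code> select the source element in <code>b</code>, and bits 4 to 5 select the destination position in <code>a</code>.</p>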
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i == ((imm &gt;&gt; 4) &amp; 3)) ? b.word[imm &amp; 3] : a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vextrins_d-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vextrins.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
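+<p>Extract one 64-bit element from <code>b</code> and insert it into <code>a</code> according to <code>imm</code>: bit 0 of <code>imm</code> selects the source element in <code>b</code>, and bit 4 selects the destination position in <code>a</code>.</p>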
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i == ((imm &gt;&gt; 4) &amp; 1)) ? b.dword[imm &amp; 1] : a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvh_b-__m128i-a-__m128i-b">__m128i __lsx_vilvh_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvh_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvh.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
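+<p>Interleave 8-bit elements in the higher half of <code>a</code> and <code>b</code>: elements from <code>b</code> go to the even positions of the result and elements from <code>a</code> go to the odd positions.</p>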
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvh_h-__m128i-a-__m128i-b">__m128i __lsx_vilvh_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvh_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvh.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
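+<p>Interleave 16-bit elements in the higher half of <code>a</code> and <code>b</code>: elements from <code>b</code> go to the even positions of the result and elements from <code>a</code> go to the odd positions.</p>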
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvh_w-__m128i-a-__m128i-b">__m128i __lsx_vilvh_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvh_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvh.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
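+<p>Interleave 32-bit elements in the higher half of <code>a</code> and <code>b</code>: elements from <code>b</code> go to the even positions of the result and elements from <code>a</code> go to the odd positions.</p>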
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvh_d-__m128i-a-__m128i-b">__m128i __lsx_vilvh_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvh_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvh.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
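+<p>Interleave 64-bit elements in the higher half of <code>a</code> and <code>b</code>: the element from <code>b</code> goes to the even position of the result and the element from <code>a</code> goes to the odd position.</p>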
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvl_b-__m128i-a-__m128i-b">__m128i __lsx_vilvl_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvl_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvl.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Interleave 8-bit elements in the lower half of <code>a</code> and <code>b</code>: even lanes of <code>dst</code> come from <code>b</code>, odd lanes from <code>a</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvl_h-__m128i-a-__m128i-b">__m128i __lsx_vilvl_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvl_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvl.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Interleave 16-bit elements in the lower half of <code>a</code> and <code>b</code>: even lanes of <code>dst</code> come from <code>b</code>, odd lanes from <code>a</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvl_w-__m128i-a-__m128i-b">__m128i __lsx_vilvl_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvl_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvl.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Interleave 32-bit elements in the lower half of <code>a</code> and <code>b</code>: even lanes of <code>dst</code> come from <code>b</code>, odd lanes from <code>a</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vilvl_d-__m128i-a-__m128i-b">__m128i __lsx_vilvl_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vilvl_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vilvl.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Interleave 64-bit elements in the lower half of <code>a</code> and <code>b</code>: even lanes of <code>dst</code> come from <code>b</code>, odd lanes from <code>a</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vinsgr2vr_b-__m128i-a-int-b-imm0_15-imm">__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vinsgr2vr.b vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Insert <code>b</code> as an 8-bit element into the lane of <code>a</code> indexed by <code>imm</code>; all other lanes are copied from <code>a</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i == imm) ? b : a.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vinsgr2vr_h-__m128i-a-int-b-imm0_7-imm">__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vinsgr2vr.h vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Insert <code>b</code> as a 16-bit element into the lane of <code>a</code> indexed by <code>imm</code>; all other lanes are copied from <code>a</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i == imm) ? b : a.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vinsgr2vr_w-__m128i-a-int-b-imm0_3-imm">__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vinsgr2vr.w vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Insert <code>b</code> as a 32-bit element into the lane of <code>a</code> indexed by <code>imm</code>; all other lanes are copied from <code>a</code>.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i == imm) ? b : a.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vinsgr2vr_d-__m128i-a-long-int-b-imm0_1-imm">__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vinsgr2vr.d vr, r, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Insert <code>b</code> as a 64-bit element into the lane of <code>a</code> indexed by <code>imm</code>; all other lanes are copied from <code>a</code>.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i == imm) ? b : a.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfrstp_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrstp.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
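+<p>Find the first negative 8-bit element in <code>b</code> and write its index into the 8-bit lane of <code>a</code> selected by the lowest 8-bit element of <code>c</code> (modulo 16); all other lanes are copied from <code>a</code>. If no element of <code>b</code> is negative, the index written is 16.</p>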
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i &lt; 16; i++) {
+  if ((s8)b.byte[i] &lt; 0) {
+    break;
+  }
+}
+dst.byte[c.byte[0] % 16] = i;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfrstp_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrstp.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
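+<p>Find the first negative 16-bit element in <code>b</code> and write its index into the 16-bit lane of <code>a</code> selected by the lowest 16-bit element of <code>c</code> (modulo 8); all other lanes are copied from <code>a</code>. If no element of <code>b</code> is negative, the index written is 8.</p>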
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i &lt; 8; i++) {
+  if ((s16)b.half[i] &lt; 0) {
+    break;
+  }
+}
+dst.half[c.half[0] % 8] = i;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfrstpi_b-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrstpi.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
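+<p>Find the first negative 8-bit element in <code>b</code> and write its index into the 8-bit lane of <code>a</code> selected by <code>imm</code> (modulo 16); all other lanes are copied from <code>a</code>. If no element of <code>b</code> is negative, the index written is 16.</p>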
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i];
+}
+int i;
+for (i = 0; i &lt; 16; i++) {
+  if ((s8)b.byte[i] &lt; 0) {
+    break;
+  }
+}
+dst.byte[imm % 16] = i;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vfrstpi_h-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vfrstpi.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
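+<p>Find the first negative 16-bit element in <code>b</code> and write its index into the 16-bit lane of <code>a</code> selected by <code>imm</code> (modulo 8); all other lanes are copied from <code>a</code>. If no element of <code>b</code> is negative, the index written is 8.</p>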
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i];
+}
+int i;
+for (i = 0; i &lt; 8; i++) {
+  if ((s16)b.half[i] &lt; 0) {
+    break;
+  }
+}
+dst.half[imm % 8] = i;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmskgez_b-__m128i-a">__m128i __lsx_vmskgez_b (__m128i a)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskgez_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmskgez.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>For each 8-bit element in <code>a</code>, set the bit of <code>dst</code> at the element's index if the element is greater than or equal to zero, otherwise clear it. All remaining bits of <code>dst</code> are zero.</p>
+<h3 id="examples">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskgez_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x00000000000001fe 0x0000000000000000
+__m128i __lsx_vmskgez_b(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x000000000000b7cf 0x0000000000000000
+</code></pre>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8080808080808080;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] |= c &lt;&lt; 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmskltz_b-__m128i-a">__m128i __lsx_vmskltz_b (__m128i a)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmskltz.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>For each 8-bit element in <code>a</code>, set the bit of <code>dst</code> at the element's index if the element is less than zero, otherwise clear it. All remaining bits of <code>dst</code> are zero.</p>
+<h3 id="examples_1">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000fe01 0x0000000000000000
+__m128i __lsx_vmskltz_b(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000004830 0x0000000000000000
+</code></pre>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8080808080808080;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] |= c &lt;&lt; 8;
+dst.dword[1] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmskltz_h-__m128i-a">__m128i __lsx_vmskltz_h (__m128i a)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_h (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmskltz.h vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>For each 16-bit element in <code>a</code>, set the bit of <code>dst</code> at the element's index if the element is less than zero, otherwise clear it. All remaining bits of <code>dst</code> are zero.</p>
+<h3 id="examples_2">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x00000000000000f0 0x0000000000000000
+__m128i __lsx_vmskltz_h(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000024 0x0000000000000000
+</code></pre>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8000800080008000;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 15;
+c |= c &lt;&lt; 30;
+c &gt;&gt;= 60;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 15;
+c |= c &lt;&lt; 30;
+c &gt;&gt;= 60;
+dst.dword[0] |= c &lt;&lt; 4;
+dst.dword[1] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmskltz_w-__m128i-a">__m128i __lsx_vmskltz_w (__m128i a)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_w (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmskltz.w vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>For each 32-bit element in <code>a</code>, set the bit of <code>dst</code> at the element's index if the element is less than zero, otherwise clear it. All remaining bits of <code>dst</code> are zero.</p>
+<h3 id="examples_3">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000000c 0x0000000000000000
+__m128i __lsx_vmskltz_w(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000004 0x0000000000000000
+</code></pre>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8000000080000000;
+u64 c = m &amp; a.dword[0];
+c |= c &lt;&lt; 31;
+c &gt;&gt;= 62;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c |= c &lt;&lt; 31;
+c &gt;&gt;= 62;
+dst.dword[0] |= c &lt;&lt; 2;
+dst.dword[1] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmskltz_d-__m128i-a">__m128i __lsx_vmskltz_d (__m128i a)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_d (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmskltz.d vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>For each 64-bit element in <code>a</code>, set the bit of <code>dst</code> at the element's index if the element is less than zero, otherwise clear it. All remaining bits of <code>dst</code> are zero.</p>
+<h3 id="examples_4">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vmskltz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x0000000000000002 0x0000000000000000
+__m128i __lsx_vmskltz_d(__m128i{0x0000808000000000, 0x0081000081716151})
+= 0x0000000000000000 0x0000000000000000
+</code></pre>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x8000000000000000;
+u64 c = m &amp; a.dword[0];
+c &gt;&gt;= 63;
+dst.dword[0] = c;
+c = m &amp; a.dword[1];
+c &gt;&gt;= 63;
+dst.dword[0] |= c &lt;&lt; 1;
+dst.dword[1] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vmsknz_b-__m128i-a">__m128i __lsx_vmsknz_b (__m128i a)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vmsknz_b (__m128i a)
+#include &lt;lsxintrin.h&gt;
+Instruction: vmsknz.b vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>For each 8-bit element in <code>a</code>, set the bit of <code>dst</code> at the element's index if the element is non-zero, otherwise clear it. All remaining bits of <code>dst</code> are zero.</p>
+<h3 id="examples_5">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vmsknz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})
+= 0x000000000000feff 0x0000000000000000
+__m128i __lsx_vmsknz_b(__m128i{0x0000111100000000, 0x0011000011111111})
+= 0x0000000000004f30 0x0000000000000000
+</code></pre>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">u64 m = 0x7F7F7F7F7F7F7F7F;
+u64 c = ~(((a.dword[0] &amp; m) + m) | a.dword[0] | m);
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] = c;
+c = ~(((a.dword[1] &amp; m) + m) | a.dword[1] | m);
+c |= c &lt;&lt; 7;
+c |= c &lt;&lt; 14;
+c |= c &lt;&lt; 28;
+c &gt;&gt;= 56;
+dst.dword[0] |= c &lt;&lt; 8;
+dst.dword[0] = (u16)~dst.dword[0];
+dst.dword[1] = 0;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackev_b-__m128i-a-__m128i-b">__m128i __lsx_vpackev_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackev_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackev.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Collect and pack even-positioned 8-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackev_h-__m128i-a-__m128i-b">__m128i __lsx_vpackev_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackev_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackev.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Collect and pack even-positioned 16-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackev_w-__m128i-a-__m128i-b">__m128i __lsx_vpackev_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackev_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackev.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Collect and pack even-positioned 32-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackev_d-__m128i-a-__m128i-b">__m128i __lsx_vpackev_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackev_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackev.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Collect and pack even-positioned 64-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackod_b-__m128i-a-__m128i-b">__m128i __lsx_vpackod_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackod_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackod.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Collect and pack odd-positioned 8-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackod_h-__m128i-a-__m128i-b">__m128i __lsx_vpackod_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackod_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackod.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Collect and pack odd-positioned 16-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackod_w-__m128i-a-__m128i-b">__m128i __lsx_vpackod_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackod_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackod.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>Collect and pack odd-positioned 32-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpackod_d-__m128i-a-__m128i-b">__m128i __lsx_vpackod_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpackod_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpackod.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>Collect and pack odd-positioned 64-bit elements in <code>a</code> and <code>b</code> and store them in <code>dst</code>.</p>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickev_b-__m128i-a-__m128i-b">__m128i __lsx_vpickev_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_44">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickev_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickev.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_44">Description</h3>
+<p>Pick even-positioned 8-bit elements in <code>b</code> first, then pick even-positioned 8-bit elements in <code>a</code>.</p>
+<h3 id="operation_44">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_44">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickev_h-__m128i-a-__m128i-b">__m128i __lsx_vpickev_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_45">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickev_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickev.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_45">Description</h3>
+<p>Pick even-positioned 16-bit elements in <code>b</code> first, then pick even-positioned 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_45">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? b.half[i * 2] : a.half[(i - 4) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_45">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickev_w-__m128i-a-__m128i-b">__m128i __lsx_vpickev_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_46">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickev_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickev.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_46">Description</h3>
+<p>Pick even-positioned 32-bit elements in <code>b</code> first, then pick even-positioned 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_46">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? b.word[i * 2] : a.word[(i - 2) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_46">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickev_d-__m128i-a-__m128i-b">__m128i __lsx_vpickev_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_47">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickev_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickev.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_47">Description</h3>
+<p>Pick even-positioned 64-bit elements in <code>b</code> first, then pick even-positioned 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_47">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_47">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_vpickve2gr_b-__m128i-a-imm0_15-idx">int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)</h2>
+<h3 id="synopsis_48">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.b r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_48">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code>, sign-extend the 8-bit element and store it into <code>dst</code>.</p>
+<h3 id="operation_48">Operation</h3>
+<pre><code class="language-c++">dst = (s8)a.byte[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_48">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="unsigned-int-__lsx_vpickve2gr_bu-__m128i-a-imm0_15-idx">unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)</h2>
+<h3 id="synopsis_49">Synopsis</h3>
+<pre><code class="language-c++">unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.bu r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_49">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code>, zero-extend the 8-bit element and store it into <code>dst</code>.</p>
+<h3 id="operation_49">Operation</h3>
+<pre><code class="language-c++">dst = (u8)a.byte[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_49">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_vpickve2gr_h-__m128i-a-imm0_7-idx">int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)</h2>
+<h3 id="synopsis_50">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.h r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_50">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code>, sign-extend the 16-bit element and store it into <code>dst</code>.</p>
+<h3 id="operation_50">Operation</h3>
+<pre><code class="language-c++">dst = (s16)a.half[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_50">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="unsigned-int-__lsx_vpickve2gr_hu-__m128i-a-imm0_7-idx">unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)</h2>
+<h3 id="synopsis_51">Synopsis</h3>
+<pre><code class="language-c++">unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.hu r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_51">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code>, zero-extend the 16-bit element and store it into <code>dst</code>.</p>
+<h3 id="operation_51">Operation</h3>
+<pre><code class="language-c++">dst = (u16)a.half[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_51">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="int-__lsx_vpickve2gr_w-__m128i-a-imm0_3-idx">int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)</h2>
+<h3 id="synopsis_52">Synopsis</h3>
+<pre><code class="language-c++">int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.w r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_52">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_52">Operation</h3>
+<pre><code class="language-c++">dst = (s32)a.word[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_52">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="unsigned-int-__lsx_vpickve2gr_wu-__m128i-a-imm0_3-idx">unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)</h2>
+<h3 id="synopsis_53">Synopsis</h3>
+<pre><code class="language-c++">unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.wu r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_53">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_53">Operation</h3>
+<pre><code class="language-c++">dst = (u32)a.word[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_53">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="long-int-__lsx_vpickve2gr_d-__m128i-a-imm0_1-idx">long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)</h2>
+<h3 id="synopsis_54">Synopsis</h3>
+<pre><code class="language-c++">long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.d r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_54">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_54">Operation</h3>
+<pre><code class="language-c++">dst = (s64)a.dword[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_54">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="unsigned-long-int-__lsx_vpickve2gr_du-__m128i-a-imm0_1-idx">unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)</h2>
+<h3 id="synopsis_55">Synopsis</h3>
+<pre><code class="language-c++">unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickve2gr.du r, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_55">Description</h3>
+<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>
+<h3 id="operation_55">Operation</h3>
+<pre><code class="language-c++">dst = (u64)a.dword[idx];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_55">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickod_b-__m128i-a-__m128i-b">__m128i __lsx_vpickod_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_56">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickod_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickod.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_56">Description</h3>
+<p>Pick odd-positioned 8-bit elements in <code>b</code> first, then pick odd-positioned 8-bit elements in <code>a</code>.</p>
+<h3 id="operation_56">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_56">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickod_h-__m128i-a-__m128i-b">__m128i __lsx_vpickod_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_57">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickod_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickod.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_57">Description</h3>
+<p>Pick odd-positioned 16-bit elements in <code>b</code> first, then pick odd-positioned 16-bit elements in <code>a</code>.</p>
+<h3 id="operation_57">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_57">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickod_w-__m128i-a-__m128i-b">__m128i __lsx_vpickod_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_58">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickod_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickod.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_58">Description</h3>
+<p>Pick odd-positioned 32-bit elements in <code>b</code> first, then pick odd-positioned 32-bit elements in <code>a</code>.</p>
+<h3 id="operation_58">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_58">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vpickod_d-__m128i-a-__m128i-b">__m128i __lsx_vpickod_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_59">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpickod_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpickod.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_59">Description</h3>
+<p>Pick odd-positioned 64-bit elements in <code>b</code> first, then pick odd-positioned 64-bit elements in <code>a</code>.</p>
+<h3 id="operation_59">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_59">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrepli_b-imm_n512_511-imm">__m128i __lsx_vrepli_b (imm_n512_511 imm)</h2>
+<h3 id="synopsis_60">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrepli_b (imm_n512_511 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldi vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_60">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_60">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m128i-__lsx_vrepli_h-imm_n512_511-imm">__m128i __lsx_vrepli_h (imm_n512_511 imm)</h2>
+<h3 id="synopsis_61">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrepli_h (imm_n512_511 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldi vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_61">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_61">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m128i-__lsx_vrepli_w-imm_n512_511-imm">__m128i __lsx_vrepli_w (imm_n512_511 imm)</h2>
+<h3 id="synopsis_62">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrepli_w (imm_n512_511 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldi vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_62">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_62">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m128i-__lsx_vrepli_d-imm_n512_511-imm">__m128i __lsx_vrepli_d (imm_n512_511 imm)</h2>
+<h3 id="synopsis_63">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrepli_d (imm_n512_511 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldi vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_63">Description</h3>
+<p>Repeat <code>imm</code> to fill whole vector.</p>
+<h3 id="operation_63">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h2 id="__m128i-__lsx_vreplgr2vr_b-int-val">__m128i __lsx_vreplgr2vr_b (int val)</h2>
+<h3 id="synopsis_64">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplgr2vr_b (int val)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplgr2vr.b vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_64">Description</h3>
+<p>Repeat <code>val</code> to fill whole vector.</p>
+<h3 id="operation_64">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_60">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplgr2vr_h-int-val">__m128i __lsx_vreplgr2vr_h (int val)</h2>
+<h3 id="synopsis_65">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplgr2vr_h (int val)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplgr2vr.h vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_65">Description</h3>
+<p>Repeat <code>val</code> to fill whole vector.</p>
+<h3 id="operation_65">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_61">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplgr2vr_w-int-val">__m128i __lsx_vreplgr2vr_w (int val)</h2>
+<h3 id="synopsis_66">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplgr2vr_w (int val)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplgr2vr.w vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_66">Description</h3>
+<p>Repeat <code>val</code> to fill whole vector.</p>
+<h3 id="operation_66">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_62">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplgr2vr_d-long-int-val">__m128i __lsx_vreplgr2vr_d (long int val)</h2>
+<h3 id="synopsis_67">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplgr2vr_d (long int val)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplgr2vr.d vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_67">Description</h3>
+<p>Repeat <code>val</code> to fill whole vector.</p>
+<h3 id="operation_67">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = val;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_63">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>N/A</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplve_b-__m128i-a-int-idx">__m128i __lsx_vreplve_b (__m128i a, int idx)</h2>
+<h3 id="synopsis_68">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplve_b (__m128i a, int idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplve.b vr, vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_68">Description</h3>
+<p>Repeat the element in lane <code>idx</code> (taken modulo 16) of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_68">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[idx % 16];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_64">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplve_h-__m128i-a-int-idx">__m128i __lsx_vreplve_h (__m128i a, int idx)</h2>
+<h3 id="synopsis_69">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplve_h (__m128i a, int idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplve.h vr, vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_69">Description</h3>
+<p>Repeat the element in lane <code>idx</code> (taken modulo 8) of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_69">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[idx % 8];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_65">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplve_w-__m128i-a-int-idx">__m128i __lsx_vreplve_w (__m128i a, int idx)</h2>
+<h3 id="synopsis_70">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplve_w (__m128i a, int idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplve.w vr, vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_70">Description</h3>
+<p>Repeat the element in lane <code>idx</code> (taken modulo 4) of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_70">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[idx % 4];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_66">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplve_d-__m128i-a-int-idx">__m128i __lsx_vreplve_d (__m128i a, int idx)</h2>
+<h3 id="synopsis_71">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplve_d (__m128i a, int idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplve.d vr, vr, r
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_71">Description</h3>
+<p>Repeat the element in lane <code>idx</code> (taken modulo 2) of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_71">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[idx % 2];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_67">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplvei_b-__m128i-a-imm0_15-idx">__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)</h2>
+<h3 id="synopsis_72">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplvei.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_72">Description</h3>
+<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_72">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[idx];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_68">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplvei_h-__m128i-a-imm0_7-idx">__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)</h2>
+<h3 id="synopsis_73">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplvei.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_73">Description</h3>
+<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_73">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[idx];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_69">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplvei_w-__m128i-a-imm0_3-idx">__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)</h2>
+<h3 id="synopsis_74">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplvei.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_74">Description</h3>
+<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_74">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[idx];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_70">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vreplvei_d-__m128i-a-imm0_1-idx">__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)</h2>
+<h3 id="synopsis_75">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)
+#include &lt;lsxintrin.h&gt;
+Instruction: vreplvei.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_75">Description</h3>
+<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>
+<h3 id="operation_75">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[idx];
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_71">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_76">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_76">Description</h3>
+<p>Clamp signed 8-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>
+<h3 id="operation_76">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = clamp&lt;s8&gt;(a.byte[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_72">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_bu-__m128i-a-imm0_7-imm">__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_77">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_77">Description</h3>
+<p>Clamp unsigned 8-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>
+<h3 id="operation_77">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = clamp&lt;u8&gt;(a.byte[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_73">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_78">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_78">Description</h3>
+<p>Clamp signed 16-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>
+<h3 id="operation_78">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = clamp&lt;s16&gt;(a.half[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_74">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_hu-__m128i-a-imm0_15-imm">__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_79">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_79">Description</h3>
+<p>Clamp unsigned 16-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>
+<h3 id="operation_79">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = clamp&lt;u16&gt;(a.half[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_75">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_80">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_80">Description</h3>
+<p>Clamp signed 32-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>
+<h3 id="operation_80">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = clamp&lt;s32&gt;(a.word[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_76">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_81">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_81">Description</h3>
+<p>Clamp unsigned 32-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>
+<h3 id="operation_81">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = clamp&lt;u32&gt;(a.word[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_77">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_82">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_82">Description</h3>
+<p>Clamp signed 64-bit elements in <code>a</code> to the range <code>[-(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1]</code> specified by <code>imm</code>.</p>
+<h3 id="operation_82">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = clamp&lt;s64&gt;(a.dword[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_78">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsat_du-__m128i-a-imm0_63-imm">__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_83">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsat.du vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_83">Description</h3>
+<p>Clamp unsigned 64-bit elements in <code>a</code> to the range <code>[0, (1 &lt;&lt; (imm + 1)) - 1]</code> specified by <code>imm</code>.</p>
+<h3 id="operation_83">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = clamp&lt;u64&gt;(a.dword[i], 0, (1 &lt;&lt; (imm + 1)) - 1);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_79">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsigncov_b-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_84">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsigncov_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsigncov.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_84">Description</h3>
+<p>If the 8-bit element in <code>a</code> equals zero, set the result to zero. If the signed 8-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, it is equivalent to computing the absolute value.</p>
+<h3 id="operation_84">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] =
+      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] &gt; 0 ? b.byte[i] : -b.byte[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_80">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsigncov_h-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_85">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsigncov_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsigncov.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_85">Description</h3>
+<p>If the 16-bit element in <code>a</code> equals zero, set the result to zero. If the signed 16-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, it is equivalent to computing the absolute value.</p>
+<h3 id="operation_85">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (a.half[i] == 0) ? 0 : ((s16)a.half[i] &gt; 0 ? b.half[i] : -b.half[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_81">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsigncov_w-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_86">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsigncov_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsigncov.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_86">Description</h3>
+<p>If the 32-bit element in <code>a</code> equals zero, set the result to zero. If the signed 32-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, it is equivalent to computing the absolute value.</p>
+<h3 id="operation_86">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] =
+      (a.word[i] == 0) ? 0 : ((s32)a.word[i] &gt; 0 ? b.word[i] : -b.word[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_82">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsigncov_d-__m128i-a-__m128i-b">__m128i __lsx_vsigncov_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_87">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsigncov_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsigncov.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_87">Description</h3>
+<p>If the 64-bit element in <code>a</code> equals zero, set the result to zero. If the signed 64-bit element in <code>a</code> is positive, copy the element in <code>b</code> to the result. Otherwise, copy the negated element in <code>b</code> to the result. If <code>a</code> and <code>b</code> are the same vector, it is equivalent to computing the absolute value.</p>
+<h3 id="operation_87">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] =
+      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] &gt; 0 ? b.dword[i] : -b.dword[i]);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_83">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vldi-imm_n1024_1023-imm">__m128i __lsx_vldi (imm_n1024_1023 imm)</h2>
+<h3 id="synopsis_88">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vldi (imm_n1024_1023 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vldi vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_88">Description</h3>
+<p>Initialize <code>dst</code> using predefined patterns:</p>
+<ul>
+<li><code>imm[12:10]=0b000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>
+<li><code>imm[12:10]=0b001</code>: broadcast sign-extended <code>imm[9:0]</code> as 16-bit elements to all lanes</li>
+<li><code>imm[12:10]=0b010</code>: broadcast sign-extended <code>imm[9:0]</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:10]=0b011</code>: broadcast sign-extended <code>imm[9:0]</code> as 64-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10000</code>: broadcast <code>imm[7:0]</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10001</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10010</code>: broadcast <code>imm[7:0] &lt;&lt; 16</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10011</code>: broadcast <code>imm[7:0] &lt;&lt; 24</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10100</code>: broadcast <code>imm[7:0]</code> as 16-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10101</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 16-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10110</code>: broadcast <code>(imm[7:0] &lt;&lt; 8) | 0xFF</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b10111</code>: broadcast <code>(imm[7:0] &lt;&lt; 16) | 0xFFFF</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11001</code>: repeat each bit of <code>imm[7:0]</code> eight times, and broadcast the result as 64-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11010</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 32-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11011</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 64-bit elements to all lanes</li>
+<li><code>imm[12:8]=0b11100</code>: broadcast <code>(imm[7] &lt;&lt; 63) | ((1-imm[6]) &lt;&lt; 62) | ((imm[6] * 0xFF) &lt;&lt; 54) | (imm[5:0] &lt;&lt; 48)</code> as 64-bit elements to all lanes</li>
+</ul>
+<h3 id="operation_88">Operation</h3>
+<pre><code class="language-c++">u64 imm12_10 = (imm &gt;&gt; 10) &amp; 0b111;
+u64 imm12_8 = (imm &gt;&gt; 8) &amp; 0b11111;
+u64 imm9_0 = imm &amp; 0x3FF;
+s64 simm9_0 = ((s64)imm9_0 &lt;&lt; 54) &gt;&gt; 54;
+u64 imm7_0 = imm &amp; 0xFF;
+u64 imm7 = (imm &gt;&gt; 7) &amp; 0x1;
+u64 imm6 = (imm &gt;&gt; 6) &amp; 0x1;
+u64 imm5 = (imm &gt;&gt; 5) &amp; 0x1;
+u64 imm5_0 = imm &amp; 0x3F;
+u64 imm4 = (imm &gt;&gt; 4) &amp; 0x1;
+u64 imm3 = (imm &gt;&gt; 3) &amp; 0x1;
+u64 imm2 = (imm &gt;&gt; 2) &amp; 0x1;
+u64 imm1 = (imm &gt;&gt; 1) &amp; 0x1;
+u64 imm0 = imm &amp; 0x1;
+
+u64 broadcast_value;
+u64 broadcast_width;
+if (imm12_10 == 0b000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_10 == 0b001) {
+  broadcast_value = simm9_0;
+  broadcast_width = 16;
+} else if (imm12_10 == 0b010) {
+  broadcast_value = simm9_0;
+  broadcast_width = 32;
+} else if (imm12_10 == 0b011) {
+  broadcast_value = simm9_0;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b10000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10001) {
+  broadcast_value = imm7_0 &lt;&lt; 8;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10010) {
+  broadcast_value = imm7_0 &lt;&lt; 16;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10011) {
+  broadcast_value = imm7_0 &lt;&lt; 24;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10100) {
+  broadcast_value = imm7_0;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10101) {
+  broadcast_value = imm7_0 &lt;&lt; 8;
+  broadcast_width = 16;
+} else if (imm12_8 == 0b10110) {
+  broadcast_value = (imm7_0 &lt;&lt; 8) | 0xFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b10111) {
+  broadcast_value = (imm7_0 &lt;&lt; 16) | 0xFFFF;
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11000) {
+  broadcast_value = imm7_0;
+  broadcast_width = 8;
+} else if (imm12_8 == 0b11001) {
+  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +
+                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +
+                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +
+                    imm7 * 0xFF00000000000000;
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11010) {
+  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |
+                    (imm5_0 &lt;&lt; 19);
+  broadcast_width = 32;
+} else if (imm12_8 == 0b11011) {
+  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |
+                    (imm5_0 &lt;&lt; 19);
+  broadcast_width = 64;
+} else if (imm12_8 == 0b11100) {
+  broadcast_value = (imm7 &lt;&lt; 63) | ((1 - imm6) &lt;&lt; 62) | ((imm6 * 0xFF) &lt;&lt; 54) |
+                    (imm5_0 &lt;&lt; 48);
+  broadcast_width = 64;
+}
+
+if (broadcast_width == 8) {
+  for (int i = 0; i &lt; 16; i++) {
+    dst.byte[i] = broadcast_value;
+  }
+} else if (broadcast_width == 16) {
+  for (int i = 0; i &lt; 8; i++) {
+    dst.half[i] = broadcast_value;
+  }
+} else if (broadcast_width == 32) {
+  for (int i = 0; i &lt; 4; i++) {
+    dst.word[i] = broadcast_value;
+  }
+} else if (broadcast_width == 64) {
+  for (int i = 0; i &lt; 2; i++) {
+    dst.dword[i] = broadcast_value;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../memory/" class="btn btn-neutral float-left" title="Memory Load & Store"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../permutation/" class="btn btn-neutral float-right" title="Permutation">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../memory/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../permutation/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/permutation/index.html b/lsx/permutation/index.html
new file mode 100644
index 00000000..41aacfdf
--- /dev/null
+++ b/lsx/permutation/index.html
@@ -0,0 +1,249 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/permutation/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Permutation - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Permutation";
+        var mkdocs_page_input_path = "lsx/permutation.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/permutation/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Permutation</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vpermi_w-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Permutation</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="permutation">Permutation</h1>
+<h2 id="__m128i-__lsx_vpermi_w-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vpermi.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Permute words from <code>a</code> and <code>b</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">dst.word[0] = b.word[imm &amp; 0x3];
+dst.word[1] = b.word[(imm &gt;&gt; 2) &amp; 0x3];
+dst.word[2] = a.word[(imm &gt;&gt; 4) &amp; 0x3];
+dst.word[3] = a.word[(imm &gt;&gt; 6) &amp; 0x3];
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../misc/" class="btn btn-neutral float-left" title="Misc"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../shift/" class="btn btn-neutral float-right" title="Shift">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../misc/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../shift/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/shift/index.html b/lsx/shift/index.html
new file mode 100644
index 00000000..aade3246
--- /dev/null
+++ b/lsx/shift/index.html
@@ -0,0 +1,7876 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shift/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Shift - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Shift";
+        var mkdocs_page_input_path = "lsx/shift.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/shift/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Shift</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbsll_v-__m128i-a-imm0_31-imm">__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vbsrl_v-__m128i-a-imm0_31-imm">__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsll_b-__m128i-a-__m128i-b">__m128i __lsx_vsll_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsll_h-__m128i-a-__m128i-b">__m128i __lsx_vsll_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsll_w-__m128i-a-__m128i-b">__m128i __lsx_vsll_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsll_d-__m128i-a-__m128i-b">__m128i __lsx_vsll_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslli_b-__m128i-a-imm0_7-imm">__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslli_h-__m128i-a-imm0_15-imm">__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslli_w-__m128i-a-imm0_31-imm">__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_8">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_8">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_8">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_8">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vslli_d-__m128i-a-imm0_63-imm">__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_9">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_9">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_9">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_9">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsllwil_h_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_10">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_10">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_10">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_10">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsllwil_hu_bu-__m128i-a-imm0_7-imm">__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_11">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_11">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_11">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_11">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsllwil_w_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_12">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_12">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_12">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_12">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsllwil_wu_hu-__m128i-a-imm0_15-imm">__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_13">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_13">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_13">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_13">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsllwil_d_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_14">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_14">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_14">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_14">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsllwil_du_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_15">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_15">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_15">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_15">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsra_b-__m128i-a-__m128i-b">__m128i __lsx_vsra_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_16">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_16">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_16">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_16">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsra_h-__m128i-a-__m128i-b">__m128i __lsx_vsra_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_17">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_17">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_17">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_17">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsra_w-__m128i-a-__m128i-b">__m128i __lsx_vsra_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_18">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_18">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_18">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_18">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsra_d-__m128i-a-__m128i-b">__m128i __lsx_vsra_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_19">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_19">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_19">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_19">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrai_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_20">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_20">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_20">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_20">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrai_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_21">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_21">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_21">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_21">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrai_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_22">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_22">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_22">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_22">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrai_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_23">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_23">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_23">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_23">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsran_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsran_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_24">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_24">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_24">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_24">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsran_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsran_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_25">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_25">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_25">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_25">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsran_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsran_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_26">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_26">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_26">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_26">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrani_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_27">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_27">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_27">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_27">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrani_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_28">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_28">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_28">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_28">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrani_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_29">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_29">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_29">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_29">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrani_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_30">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_30">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_30">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_30">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrar_b-__m128i-a-__m128i-b">__m128i __lsx_vsrar_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_31">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_31">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_31">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_31">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrar_h-__m128i-a-__m128i-b">__m128i __lsx_vsrar_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_32">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_32">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_32">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_32">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrar_w-__m128i-a-__m128i-b">__m128i __lsx_vsrar_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_33">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_33">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_33">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_33">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrar_d-__m128i-a-__m128i-b">__m128i __lsx_vsrar_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_34">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_34">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_34">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_34">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrari_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_35">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_35">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_35">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_35">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrari_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_36">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_36">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_36">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_36">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrari_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_37">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_37">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_37">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_37">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrari_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_38">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_38">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_38">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_38">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_39">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_39">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_39">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_39">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_40">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_40">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_40">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_40">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_41">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_41">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_41">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_41">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_42">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_42">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_42">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_42">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_43">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_43">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_43">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_43">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_44">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_44">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_44">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_44">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrarni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_45">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_45">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_45">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_45">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrl_b-__m128i-a-__m128i-b">__m128i __lsx_vsrl_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_46">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_46">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_46">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_46">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrl_h-__m128i-a-__m128i-b">__m128i __lsx_vsrl_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_47">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_47">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_47">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_47">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrl_w-__m128i-a-__m128i-b">__m128i __lsx_vsrl_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_48">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_48">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_48">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_48">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrl_d-__m128i-a-__m128i-b">__m128i __lsx_vsrl_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_49">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_49">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_49">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_49">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrli_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_50">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_50">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_50">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_50">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrli_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_51">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_51">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_51">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_51">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrli_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_52">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_52">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_52">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_52">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrli_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_53">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_53">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_53">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_53">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrln_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_54">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_54">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_54">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_54">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrln_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_55">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_55">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_55">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_55">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrln_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_56">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_56">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_56">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_56">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_57">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_57">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_57">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_57">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_58">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_58">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_58">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_58">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_59">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_59">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_59">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_59">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_60">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_60">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_60">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_60">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlr_b-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_61">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_61">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_61">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_61">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlr_h-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_62">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_62">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_62">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_62">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlr_w-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_63">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_63">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_63">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_63">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlr_d-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_64">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_64">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_64">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_64">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlri_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_65">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_65">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_65">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_65">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlri_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_66">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_66">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_66">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_66">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlri_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_67">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_67">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_67">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_67">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlri_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_68">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_68">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_68">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_68">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_69">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_69">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_69">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_69">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_70">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_70">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_70">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_70">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_71">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_71">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_71">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_71">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_72">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_72">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_72">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_72">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_73">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_73">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_73">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_73">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_74">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_74">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_74">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_74">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vsrlrni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_75">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_75">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_75">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_75">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssran_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssran_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_76">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_76">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_76">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_76">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssran_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_77">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_77">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_77">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_77">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssran_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssran_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_78">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_78">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_78">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_78">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssran_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_79">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_79">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_79">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_79">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssran_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssran_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_80">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_80">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_80">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_80">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssran_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_81">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_81">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_81">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_81">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_82">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_82">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_82">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_82">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_83">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_83">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_83">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_83">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_84">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_84">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_84">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_84">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_85">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_85">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_85">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_85">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_86">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_86">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_86">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_86">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_87">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_87">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_87">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_87">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_88">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_88">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_88">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_88">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrani_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_89">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_89">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_89">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_89">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_90">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_90">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_90">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_90">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarn_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_91">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_91">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_91">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_91">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_92">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_92">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_92">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_92">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarn_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_93">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_93">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_93">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_93">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_94">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_94">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_94">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_94">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarn_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_95">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_95">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_95">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_95">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_96">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_96">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_96">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_96">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_97">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_97">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_97">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_97">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_98">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_98">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_98">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_98">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_99">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_99">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_99">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_99">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_100">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_100">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_100">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_100">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_101">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_101">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_101">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_101">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_102">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_102">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_102">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_102">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrarni_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_103">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_103">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_103">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_103">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrln_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_104">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_104">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_104">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_104">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrln_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_105">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_105">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_105">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_105">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrln_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_106">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_106">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_106">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_106">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrln_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_107">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_107">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_107">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_107">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrln_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_108">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_108">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_108">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_108">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrln_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_109">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_109">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_109">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_109">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_110">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_110">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_110">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_110">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_111">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_111">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_111">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_111">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_112">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_112">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_112">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_112">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_113">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_113">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_113">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_113">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_114">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_114">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_114">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_114">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_115">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_115">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_115">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_115">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_116">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_116">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_116">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_116">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlni_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_117">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_117">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_117">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_117">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_118">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_118">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_118">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_118">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrn_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_119">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_119">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_119">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_119">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_120">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_120">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_120">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_120">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrn_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_121">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_121">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_121">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_121">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_122">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_122">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_122">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_122">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrn_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_123">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_123">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_123">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_123">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_124">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_124">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_124">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_124">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_125">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_125">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_125">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_125">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_126">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_126">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_126">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_126">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_127">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_127">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_127">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_127">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_128">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_128">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_128">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_128">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_129">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_129">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_129">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_129">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_130">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_130">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_130">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_130">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vssrlrni_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_131">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_131">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_131">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_131">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotr_b-__m128i-a-__m128i-b">__m128i __lsx_vrotr_b (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_132">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_132">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_132">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_132">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotr_h-__m128i-a-__m128i-b">__m128i __lsx_vrotr_h (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_133">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_133">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_133">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_133">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotr_w-__m128i-a-__m128i-b">__m128i __lsx_vrotr_w (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_134">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_134">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_134">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_134">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotr_d-__m128i-a-__m128i-b">__m128i __lsx_vrotr_d (__m128i a, __m128i b)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_135">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_135">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_135">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_135">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotri_b-__m128i-a-imm0_7-imm">__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_136">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_136">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_136">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_136">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotri_h-__m128i-a-imm0_15-imm">__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_137">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_137">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_137">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_137">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotri_w-__m128i-a-imm0_31-imm">__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_138">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_138">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_138">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_138">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vrotri_d-__m128i-a-imm0_63-imm">__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_139">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_139">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_139">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_139">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">Lsx</li>
+      <li class="breadcrumb-item active">Shift</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="shift">Shift</h1>
+<h2 id="__m128i-__lsx_vbsll_v-__m128i-a-imm0_31-imm">__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbsll.v vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
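+<p>Shift the whole 128-bit vector <code>a</code> left by <code>imm * 8</code> bits (that is, by <code>imm</code> bytes) and store the result to <code>dst</code>. The shift amount is taken modulo 128, so only the low 4 bits of <code>imm</code> take effect.</p>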
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] &lt;&lt; shift;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vbsrl_v-__m128i-a-imm0_31-imm">__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vbsrl.v vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
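+<p>Logically shift the whole 128-bit vector <code>a</code> right by <code>imm * 8</code> bits (that is, by <code>imm</code> bytes) and store the result to <code>dst</code>. The shift amount is taken modulo 128, so only the low 4 bits of <code>imm</code> take effect.</p>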
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">int shift = (imm * 8) % 128;
+dst.qword[0] = (u128)a.qword[0] &gt;&gt; shift;
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsll_b-__m128i-a-__m128i-b">__m128i __lsx_vsll_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsll_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsll.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &lt;&lt; (b.byte[i] &amp; 0x7);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsll_h-__m128i-a-__m128i-b">__m128i __lsx_vsll_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsll_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsll.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] &lt;&lt; (b.half[i] &amp; 0xf);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsll_w-__m128i-a-__m128i-b">__m128i __lsx_vsll_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsll_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsll.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] &lt;&lt; (b.word[i] &amp; 0x1f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsll_d-__m128i-a-__m128i-b">__m128i __lsx_vsll_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsll_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsll.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &lt;&lt; (b.dword[i] &amp; 0x3f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslli_b-__m128i-a-imm0_7-imm">__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslli.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslli_h-__m128i-a-imm0_15-imm">__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslli.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslli_w-__m128i-a-imm0_31-imm">__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_8">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslli.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_8">Description</h3>
+<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_8">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_8">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vslli_d-__m128i-a-imm0_63-imm">__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_9">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vslli.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_9">Description</h3>
+<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_9">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_9">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsllwil_h_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_10">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsllwil.h.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_10">Description</h3>
+<p>Sign-extend the low eight signed 8-bit elements in <code>a</code> to 16 bits, left shift each by <code>imm</code>, and store the signed 16-bit results to <code>dst</code>.</p>
+<h3 id="operation_10">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (s16)(s8)a.byte[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_10">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsllwil_hu_bu-__m128i-a-imm0_7-imm">__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_11">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsllwil.hu.bu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_11">Description</h3>
+<p>Zero-extend the low eight unsigned 8-bit elements in <code>a</code> to 16 bits, left shift each by <code>imm</code>, and store the unsigned 16-bit results to <code>dst</code>.</p>
+<h3 id="operation_11">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (u16)(u8)a.byte[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_11">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsllwil_w_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_12">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsllwil.w.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_12">Description</h3>
+<p>Sign-extend the low four signed 16-bit elements in <code>a</code> to 32 bits, left shift each by <code>imm</code>, and store the signed 32-bit results to <code>dst</code>.</p>
+<h3 id="operation_12">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (s32)(s16)a.half[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_12">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsllwil_wu_hu-__m128i-a-imm0_15-imm">__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_13">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsllwil.wu.hu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_13">Description</h3>
+<p>Zero-extend the low four unsigned 16-bit elements in <code>a</code> to 32 bits, left shift each by <code>imm</code>, and store the unsigned 32-bit results to <code>dst</code>.</p>
+<h3 id="operation_13">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (u32)(u16)a.half[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_13">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsllwil_d_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_14">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsllwil.d.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_14">Description</h3>
+<p>Sign-extend the low two signed 32-bit elements in <code>a</code> to 64 bits, left shift each by <code>imm</code>, and store the signed 64-bit results to <code>dst</code>.</p>
+<h3 id="operation_14">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (s64)(s32)a.word[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_14">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsllwil_du_wu-__m128i-a-imm0_31-imm">__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_15">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsllwil.du.wu vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_15">Description</h3>
+<p>Zero-extend the low two unsigned 32-bit elements in <code>a</code> to 64 bits, left shift each by <code>imm</code>, and store the unsigned 64-bit results to <code>dst</code>.</p>
+<h3 id="operation_15">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (u64)(u32)a.word[i] &lt;&lt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_15">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsra_b-__m128i-a-__m128i-b">__m128i __lsx_vsra_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_16">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsra_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsra.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_16">Description</h3>
+<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_16">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; (b.byte[i] &amp; 0x7);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_16">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsra_h-__m128i-a-__m128i-b">__m128i __lsx_vsra_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_17">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsra_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsra.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_17">Description</h3>
+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_17">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i]) &gt;&gt; (b.half[i] &amp; 0xf);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_17">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsra_w-__m128i-a-__m128i-b">__m128i __lsx_vsra_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_18">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsra_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsra.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_18">Description</h3>
+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_18">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i]) &gt;&gt; (b.word[i] &amp; 0x1f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_18">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsra_d-__m128i-a-__m128i-b">__m128i __lsx_vsra_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_19">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsra_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsra.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_19">Description</h3>
+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_19">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; (b.dword[i] &amp; 0x3f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_19">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrai_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_20">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrai.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_20">Description</h3>
+<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_20">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_20">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrai_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_21">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrai.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_21">Description</h3>
+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_21">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = ((s16)a.half[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_21">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrai_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_22">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrai.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_22">Description</h3>
+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_22">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = ((s32)a.word[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_22">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrai_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_23">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrai.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_23">Description</h3>
+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_23">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_23">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsran_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsran_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_24">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsran_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsran.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_24">Description</h3>
+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate each result to 8 bits and store it to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_24">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? (s8)((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_24">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsran_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsran_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_25">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsran_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsran.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_25">Description</h3>
+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate each result to 16 bits and store it to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_25">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? (s16)((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_25">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsran_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsran_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_26">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsran_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsran.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_26">Description</h3>
+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code>, truncate each result to 32 bits and store it to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_26">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (s32)((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_26">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrani_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_27">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrani.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_27">Description</h3>
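+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate each result to 8 bits. The results from <code>b</code> are stored to the lower half of <code>dst</code> and the results from <code>a</code> to the upper half.</p>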
+<h3 id="operation_27">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] =
+      (i &lt; 8) ? (s8)((s16)b.half[i] &gt;&gt; imm) : (s8)((s16)a.half[i - 8] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_27">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrani_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_28">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrani.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_28">Description</h3>
+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 16-bit; the results from <code>b</code> are stored to the lower half of <code>dst</code> and the results from <code>a</code> to the upper half.</p>
+<h3 id="operation_28">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (i &lt; 4) ? (s16)((s32)b.word[i] &gt;&gt; imm) : (s16)((s32)a.word[i - 4] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_28">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrani_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_29">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrani.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_29">Description</h3>
+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 32-bit; the results from <code>b</code> are stored to the lower half of <code>dst</code> and the results from <code>a</code> to the upper half.</p>
+<h3 id="operation_29">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (s32)((s64)b.dword[i] &gt;&gt; imm)
+                        : (s32)((s64)a.dword[i - 2] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_29">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrani_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_30">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrani.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_30">Description</h3>
+<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 64-bit; the result from <code>b</code> is stored to the lower half of <code>dst</code> and the result from <code>a</code> to the upper half.</p>
+<h3 id="operation_30">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? (s64)((s128)b.qword[i] &gt;&gt; imm)
+                         : (s64)((s128)a.qword[i - 1] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_30">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrar_b-__m128i-a-__m128i-b">__m128i __lsx_vsrar_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_31">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrar_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrar.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_31">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_31">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if ((b.byte[i] &amp; 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +
+                  (((s8)a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_31">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrar_h-__m128i-a-__m128i-b">__m128i __lsx_vsrar_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_32">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrar_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrar.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_32">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_32">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if ((b.half[i] &amp; 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +
+                  (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_32">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrar_w-__m128i-a-__m128i-b">__m128i __lsx_vsrar_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_33">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrar_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrar.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_33">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_33">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if ((b.word[i] &amp; 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +
+                  (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_33">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrar_d-__m128i-a-__m128i-b">__m128i __lsx_vsrar_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_34">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrar_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrar.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_34">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_34">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if ((b.dword[i] &amp; 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +
+                   (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_34">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrari_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_35">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrari.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_35">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_35">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; imm) + (((s8)a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_35">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrari_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_36">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrari.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_36">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_36">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] =
+        ((s16)a.half[i] &gt;&gt; imm) + (((s16)a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_36">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrari_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_37">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrari.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_37">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_37">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] =
+        ((s32)a.word[i] &gt;&gt; imm) + (((s32)a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_37">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrari_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_38">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrari.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_38">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_38">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] =
+        ((s64)a.dword[i] &gt;&gt; imm) + (((s64)a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_38">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_39">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarn.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_39">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_39">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u8 shift = (b.half[i] &amp; 15);
+    if (shift == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i] &gt;&gt; shift) +
+                         (((s16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_39">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_40">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarn.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_40">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_40">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u8 shift = (b.word[i] &amp; 31);
+    if (shift == 0) {
+      dst.half[i] = (s16)(s32)a.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i] &gt;&gt; shift) +
+                          (((s32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_40">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_41">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarn.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_41">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_41">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u8 shift = (b.dword[i] &amp; 63);
+    if (shift == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i] &gt;&gt; shift) +
+                          (((s64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_41">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_42">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarni.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_42">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 8-bit; the results from <code>b</code> are stored to the lower half of <code>dst</code> and the results from <code>a</code> to the upper half.</p>
+<h3 id="operation_42">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (s8)(((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (s8)(s16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (s8)(((s16)a.half[i - 8] &gt;&gt; imm) +
+                         (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_42">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_43">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarni.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_43">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 16-bit; the results from <code>b</code> are stored to the lower half of <code>dst</code> and the results from <code>a</code> to the upper half.</p>
+<h3 id="operation_43">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)b.word[i];
+    } else {
+      dst.half[i] = (s16)(((s32)b.word[i] &gt;&gt; imm) +
+                          (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (s16)(s32)a.word[i - 4];
+    } else {
+      dst.half[i] = (s16)(((s32)a.word[i - 4] &gt;&gt; imm) +
+                          (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_43">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_44">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarni.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_44">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 32-bit; the results from <code>b</code> are stored to the lower half of <code>dst</code> and the results from <code>a</code> to the upper half.</p>
+<h3 id="operation_44">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)b.dword[i];
+    } else {
+      dst.word[i] = (s32)(((s64)b.dword[i] &gt;&gt; imm) +
+                          (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (s32)(s64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (s32)(((s64)a.dword[i - 2] &gt;&gt; imm) +
+                          (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_44">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrarni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_45">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrarni.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_45">Description</h3>
+<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 64-bit; the result from <code>b</code> is stored to the lower half of <code>dst</code> and the result from <code>a</code> to the upper half.</p>
+<h3 id="operation_45">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)b.qword[i];
+    } else {
+      dst.dword[i] = (s64)(((s128)b.qword[i] &gt;&gt; imm) +
+                           (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (s64)(s128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (s64)(((s128)a.qword[i - 1] &gt;&gt; imm) +
+                           (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_45">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrl_b-__m128i-a-__m128i-b">__m128i __lsx_vsrl_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_46">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrl_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrl.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_46">Description</h3>
+<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_46">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_46">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrl_h-__m128i-a-__m128i-b">__m128i __lsx_vsrl_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_47">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrl_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrl.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_47">Description</h3>
+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_47">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] &gt;&gt; (b.half[i] &amp; 0xf);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_47">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrl_w-__m128i-a-__m128i-b">__m128i __lsx_vsrl_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_48">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrl_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrl.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_48">Description</h3>
+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_48">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_48">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrl_d-__m128i-a-__m128i-b">__m128i __lsx_vsrl_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_49">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrl_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrl.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_49">Description</h3>
+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_49">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_49">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrli_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_50">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrli.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_50">Description</h3>
+<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_50">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_50">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrli_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_51">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrli.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_51">Description</h3>
+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_51">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_51">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrli_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_52">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrli.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_52">Description</h3>
+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_52">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_52">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrli_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_53">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrli.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_53">Description</h3>
+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_53">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = a.dword[i] &gt;&gt; imm;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_53">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrln_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_54">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrln.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_54">Description</h3>
+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to the lower half of <code>dst</code>; the upper half of <code>dst</code> is set to zero.</p>
+<h3 id="operation_54">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (i &lt; 8) ? (u8)((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_54">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrln_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_55">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrln.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_55">Description</h3>
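+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), truncate to 16-bit and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>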
+<h3 id="operation_55">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (i &lt; 4) ? (u16)((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_55">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrln_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_56">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrln.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_56">Description</h3>
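+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), truncate to 32-bit and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>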
+<h3 id="operation_56">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (u32)((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_56">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_57">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlni.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_57">Description</h3>
+<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 8-bit. The results from <code>b</code> are stored in the lower half of <code>dst</code> and the results from <code>a</code> in the upper half.</p>
+<h3 id="operation_57">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] =
+      (i &lt; 8) ? (u8)((u16)b.half[i] &gt;&gt; imm) : (u8)((u16)a.half[i - 8] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_57">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_58">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlni.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_58">Description</h3>
+<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 16-bit. The results from <code>b</code> are stored in the lower half of <code>dst</code> and the results from <code>a</code> in the upper half.</p>
+<h3 id="operation_58">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] =
+      (i &lt; 4) ? (u16)((u32)b.word[i] &gt;&gt; imm) : (u16)((u32)a.word[i - 4] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_58">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_59">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlni.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_59">Description</h3>
+<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 32-bit. The results from <code>b</code> are stored in the lower half of <code>dst</code> and the results from <code>a</code> in the upper half.</p>
+<h3 id="operation_59">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (i &lt; 2) ? (u32)((u64)b.dword[i] &gt;&gt; imm)
+                        : (u32)((u64)a.dword[i - 2] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_59">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_60">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlni.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_60">Description</h3>
+<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 64-bit. The result from <code>b</code> is stored in the lower half of <code>dst</code> and the result from <code>a</code> in the upper half.</p>
+<h3 id="operation_60">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (i &lt; 1) ? (u64)((u128)b.qword[i] &gt;&gt; imm)
+                         : (u64)((u128)a.qword[i - 1] &gt;&gt; imm);
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_60">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlr_b-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_61">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlr_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlr.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_61">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_61">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if ((b.byte[i] &amp; 0x7) == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +
+                  ((a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_61">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlr_h-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_62">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlr_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlr.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_62">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_62">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if ((b.half[i] &amp; 0xf) == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +
+                  ((a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_62">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlr_w-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_63">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlr_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlr.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_63">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_63">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if ((b.word[i] &amp; 0x1f) == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +
+                  ((a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_63">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlr_d-__m128i-a-__m128i-b">__m128i __lsx_vsrlr_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_64">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlr_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlr.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_64">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_64">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if ((b.dword[i] &amp; 0x3f) == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +
+                   ((a.dword[i] &gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_64">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlri_b-__m128i-a-imm0_7-imm">__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_65">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlri.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_65">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_65">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (imm == 0) {
+    dst.byte[i] = a.byte[i];
+  } else {
+    dst.byte[i] = (a.byte[i] &gt;&gt; imm) + ((a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_65">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlri_h-__m128i-a-imm0_15-imm">__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_66">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlri.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_66">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_66">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (imm == 0) {
+    dst.half[i] = a.half[i];
+  } else {
+    dst.half[i] = (a.half[i] &gt;&gt; imm) + ((a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_66">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlri_w-__m128i-a-imm0_31-imm">__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_67">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlri.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_67">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_67">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (imm == 0) {
+    dst.word[i] = a.word[i];
+  } else {
+    dst.word[i] = (a.word[i] &gt;&gt; imm) + ((a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_67">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlri_d-__m128i-a-imm0_63-imm">__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_68">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlri.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_68">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_68">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (imm == 0) {
+    dst.dword[i] = a.dword[i];
+  } else {
+    dst.dword[i] = (a.dword[i] &gt;&gt; imm) + ((a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_68">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_69">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_69">Description</h3>
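+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), truncate to 8-bit and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>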
+<h3 id="operation_69">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u8 shift = (b.half[i] &amp; 15);
+    if (shift == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i] &gt;&gt; shift) +
+                         (((u16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_69">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_70">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_70">Description</h3>
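+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), truncate to 16-bit and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>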
+<h3 id="operation_70">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u8 shift = (b.word[i] &amp; 31);
+    if (shift == 0) {
+      dst.half[i] = (u16)(u32)a.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i] &gt;&gt; shift) +
+                          (((u32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_70">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_71">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_71">Description</h3>
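+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), truncate to 32-bit and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>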
+<h3 id="operation_71">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u8 shift = (b.dword[i] &amp; 63);
+    if (shift == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i] &gt;&gt; shift) +
+                          (((u64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));
+    }
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_71">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_72">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_72">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 8-bit. The results from <code>b</code> are stored in the lower half of <code>dst</code> and the results from <code>a</code> in the upper half.</p>
+<h3 id="operation_72">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)b.half[i];
+    } else {
+      dst.byte[i] =
+          (u8)(((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.byte[i] = (u8)(u16)a.half[i - 8];
+    } else {
+      dst.byte[i] = (u8)(((u16)a.half[i - 8] &gt;&gt; imm) +
+                         (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_72">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_73">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_73">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 16-bit. The results from <code>b</code> are stored in the lower half of <code>dst</code> and the results from <code>a</code> in the upper half.</p>
+<h3 id="operation_73">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)b.word[i];
+    } else {
+      dst.half[i] = (u16)(((u32)b.word[i] &gt;&gt; imm) +
+                          (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.half[i] = (u16)(u32)a.word[i - 4];
+    } else {
+      dst.half[i] = (u16)(((u32)a.word[i - 4] &gt;&gt; imm) +
+                          (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_73">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_74">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_74">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 32-bit. The results from <code>b</code> are stored in the lower half of <code>dst</code> and the results from <code>a</code> in the upper half.</p>
+<h3 id="operation_74">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)b.dword[i];
+    } else {
+      dst.word[i] = (u32)(((u64)b.dword[i] &gt;&gt; imm) +
+                          (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.word[i] = (u32)(u64)a.dword[i - 2];
+    } else {
+      dst.word[i] = (u32)(((u64)a.dword[i - 2] &gt;&gt; imm) +
+                          (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_74">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vsrlrni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_75">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vsrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_75">Description</h3>
+<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code> and truncate to 64-bit. The result from <code>b</code> is stored in the lower half of <code>dst</code> and the result from <code>a</code> in the upper half.</p>
+<h3 id="operation_75">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)b.qword[i];
+    } else {
+      dst.dword[i] = (u64)(((u128)b.qword[i] &gt;&gt; imm) +
+                           (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  } else {
+    if (imm == 0) {
+      dst.dword[i] = (u64)(u128)a.qword[i - 1];
+    } else {
+      dst.dword[i] = (u64)(((u128)a.qword[i - 1] &gt;&gt; imm) +
+                           (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));
+    }
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_75">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssran_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssran_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_76">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssran_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssran.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_76">Description</h3>
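+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp to fit in a signed 8-bit integer and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>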
+<h3 id="operation_76">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_76">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssran_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_77">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssran.bu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_77">Description</h3>
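+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp to fit in an unsigned 8-bit integer and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>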
+<h3 id="operation_77">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_77">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssran_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssran_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_78">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssran_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssran.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_78">Description</h3>
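+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp to fit in a signed 16-bit integer and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>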
+<h3 id="operation_78">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_78">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssran_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_79">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssran.hu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_79">Description</h3>
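+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp to fit in an unsigned 16-bit integer and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>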
+<h3 id="operation_79">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_79">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssran_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssran_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_80">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssran_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssran.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_80">Description</h3>
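+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp to fit in a signed 32-bit integer and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>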
+<h3 id="operation_80">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_80">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssran_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_81">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssran.wu.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_81">Description</h3>
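+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 64), clamp each result to fit in an unsigned 32-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>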
+<h3 id="operation_81">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_81">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_82">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_82">Description</h3>
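+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>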
+<h3 id="operation_82">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_82">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_83">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.bu.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_83">Description</h3>
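+<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>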
+<h3 id="operation_83">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp = (s16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_83">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_84">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_84">Description</h3>
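+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>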
+<h3 id="operation_84">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_84">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_85">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.hu.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_85">Description</h3>
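+<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>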
+<h3 id="operation_85">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp = (s32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_85">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_86">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_86">Description</h3>
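+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>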
+<h3 id="operation_86">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_86">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_87">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.wu.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_87">Description</h3>
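+<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>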
+<h3 id="operation_87">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp = (s64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_87">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_88">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_88">Description</h3>
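+<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer, and store the results to <code>dst</code>: the result from <code>b</code> fills the lower half and the result from <code>a</code> fills the upper half.</p>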
+<h3 id="operation_88">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp = (s128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_88">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrani_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_89">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrani.du.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_89">Description</h3>
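+<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 64-bit integer, and store the results to <code>dst</code>: the result from <code>b</code> fills the lower half and the result from <code>a</code> fills the upper half.</p>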
+<h3 id="operation_89">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp = (s128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_89">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_90">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarn.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_90">Description</h3>
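+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 16), clamp each result to fit in a signed 8-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>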
+<h3 id="operation_90">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_90">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarn_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_91">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarn.bu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_91">Description</h3>
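+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 16), clamp each result to fit in an unsigned 8-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>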
+<h3 id="operation_91">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (s16)a.half[i];
+    } else {
+      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_91">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_92">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarn.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_92">Description</h3>
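+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 32), clamp each result to fit in a signed 16-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>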
+<h3 id="operation_92">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_92">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarn_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_93">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarn.hu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_93">Description</h3>
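+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 32), clamp each result to fit in an unsigned 16-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>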
+<h3 id="operation_93">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (s32)a.word[i];
+    } else {
+      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_93">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_94">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarn.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_94">Description</h3>
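+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 64), clamp each result to fit in a signed 32-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>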
+<h3 id="operation_94">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_94">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarn_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_95">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarn.wu.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_95">Description</h3>
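+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (shift amount taken modulo 64), clamp each result to fit in an unsigned 32-bit integer, and store the results to the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>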
+<h3 id="operation_95">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (s64)a.dword[i];
+    } else {
+      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_95">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_96">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_96">Description</h3>
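+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>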
+<h3 id="operation_96">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_96">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_97">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.bu.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_97">Description</h3>
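+<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>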
+<h3 id="operation_97">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)b.half[i];
+    } else {
+      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  } else {
+    s16 temp;
+    if (imm == 0) {
+      temp = (s16)a.half[i - 8];
+    } else {
+      temp =
+          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_97">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_98">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_98">Description</h3>
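+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>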
+<h3 id="operation_98">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_98">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_99">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.hu.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_99">Description</h3>
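+<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>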
+<h3 id="operation_99">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)b.word[i];
+    } else {
+      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  } else {
+    s32 temp;
+    if (imm == 0) {
+      temp = (s32)a.word[i - 4];
+    } else {
+      temp =
+          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_99">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_100">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_100">Description</h3>
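+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>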
+<h3 id="operation_100">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +
+             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_100">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_101">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.wu.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_101">Description</h3>
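+<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer, and store the results to <code>dst</code>: the results from <code>b</code> fill the lower half and the results from <code>a</code> fill the upper half.</p>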
+<h3 id="operation_101">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)b.dword[i];
+    } else {
+      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  } else {
+    s64 temp;
+    if (imm == 0) {
+      temp = (s64)a.dword[i - 2];
+    } else {
+      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +
+             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_101">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_102">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_102">Description</h3>
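+<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer, and store the results to <code>dst</code>: the result from <code>b</code> fills the lower half and the result from <code>a</code> fills the upper half.</p>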
+<h3 id="operation_102">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +
+             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_102">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrarni_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_103">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrarni.du.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_103">Description</h3>
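+<p>Arithmetically right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 64-bit integer, and store the results in <code>dst</code>: the result from <code>b</code> goes to the lower half and the result from <code>a</code> goes to the upper half.</p>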
+<h3 id="operation_103">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)b.qword[i];
+    } else {
+      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    s128 temp;
+    if (imm == 0) {
+      temp = (s128)a.qword[i - 1];
+    } else {
+      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +
+             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_103">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrln_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_104">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrln.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_104">Description</h3>
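+<p>Logically right shift the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in a signed 8-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>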
+<h3 id="operation_104">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_104">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrln_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_105">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrln.bu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_105">Description</h3>
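+<p>Logically right shift the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in an unsigned 8-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>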
+<h3 id="operation_105">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_105">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrln_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_106">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrln.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_106">Description</h3>
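+<p>Logically right shift the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in a signed 16-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>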
+<h3 id="operation_106">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_106">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrln_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_107">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrln.hu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_107">Description</h3>
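+<p>Logically right shift the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in an unsigned 16-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>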
+<h3 id="operation_107">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_107">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrln_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_108">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrln.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_108">Description</h3>
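+<p>Logically right shift the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in a signed 32-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>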
+<h3 id="operation_108">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_108">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrln_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_109">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrln.wu.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_109">Description</h3>
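+<p>Logically right shift the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in an unsigned 32-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>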
+<h3 id="operation_109">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_109">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_110">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_110">Description</h3>
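+<p>Logically right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>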
+<h3 id="operation_110">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_110">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_111">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.bu.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_111">Description</h3>
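+<p>Logically right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>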
+<h3 id="operation_111">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp = (u16)b.half[i] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_111">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_112">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_112">Description</h3>
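+<p>Logically right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>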
+<h3 id="operation_112">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_112">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_113">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.hu.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_113">Description</h3>
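+<p>Logically right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>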
+<h3 id="operation_113">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp = (u32)b.word[i] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_113">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_114">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_114">Description</h3>
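+<p>Logically right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>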
+<h3 id="operation_114">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_114">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_115">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.wu.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_115">Description</h3>
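+<p>Logically right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>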
+<h3 id="operation_115">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp = (u64)b.dword[i] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_115">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_116">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_116">Description</h3>
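+<p>Logically right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer, and store the results in <code>dst</code>: the result from <code>b</code> goes to the lower half and the result from <code>a</code> goes to the upper half.</p>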
+<h3 id="operation_116">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp = (u128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_116">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlni_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_117">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlni.du.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_117">Description</h3>
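+<p>Logically right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 64-bit integer, and store the results in <code>dst</code>: the result from <code>b</code> goes to the lower half and the result from <code>a</code> goes to the upper half.</p>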
+<h3 id="operation_117">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp = (u128)b.qword[i] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_117">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrn_b_h-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_118">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrn.b.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_118">Description</h3>
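+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in a signed 8-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>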
+<h3 id="operation_118">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_118">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrn_bu_h-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_119">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrn.bu.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_119">Description</h3>
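+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 4 bits of each shift amount are used), clamp each result to fit in an unsigned 8-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>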
+<h3 id="operation_119">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if ((b.half[i] &amp; 15) == 0) {
+      temp = (u16)a.half[i];
+    } else {
+      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +
+             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    dst.byte[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_119">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrn_h_w-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_120">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrn.h.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_120">Description</h3>
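+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in a signed 16-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>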
+<h3 id="operation_120">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_120">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrn_hu_w-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_121">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrn.hu.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_121">Description</h3>
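+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 5 bits of each shift amount are used), clamp each result to fit in an unsigned 16-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>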
+<h3 id="operation_121">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if ((b.word[i] &amp; 31) == 0) {
+      temp = (u32)a.word[i];
+    } else {
+      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +
+             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    dst.half[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_121">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrn_w_d-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_122">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrn.w.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_122">Description</h3>
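+<p>Logically right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in a signed 32-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>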
+<h3 id="operation_122">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_122">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrn_wu_d-__m128i-a-__m128i-b">__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_123">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrn.wu.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_123">Description</h3>
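+<p>Logically right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by the corresponding elements in <code>b</code> (only the low 6 bits of each shift amount are used), clamp each result to fit in an unsigned 32-bit integer, and store the results in the lower half of <code>dst</code>. The upper half of <code>dst</code> is set to zero.</p>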
+<h3 id="operation_123">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if ((b.dword[i] &amp; 63) == 0) {
+      temp = (u64)a.dword[i];
+    } else {
+      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +
+             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    dst.word[i] = 0;
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_123">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_b_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_124">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.b.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_124">Description</h3>
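+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 8-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>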
+<h3 id="operation_124">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_124">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_bu_h-__m128i-a-__m128i-b-imm0_15-imm">__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)</h2>
+<h3 id="synopsis_125">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.bu.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_125">Description</h3>
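+<p>Logically right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 8-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>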
+<h3 id="operation_125">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (i &lt; 8) {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)b.half[i];
+    } else {
+      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  } else {
+    u16 temp;
+    if (imm == 0) {
+      temp = (u16)a.half[i - 8];
+    } else {
+      temp =
+          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_125">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_h_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_126">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.h.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_126">Description</h3>
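+<p>Logically right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 16-bit integer, and store the results in <code>dst</code>: results from <code>b</code> go to the lower half and results from <code>a</code> go to the upper half.</p>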
+<h3 id="operation_126">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_126">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_hu_w-__m128i-a-__m128i-b-imm0_31-imm">__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)</h2>
+<h3 id="synopsis_127">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.hu.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_127">Description</h3>
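+<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 16-bit integer, and store the results to <code>dst</code>: the elements from <code>b</code> fill the lower half of <code>dst</code> and the elements from <code>a</code> fill the upper half.</p>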
+<h3 id="operation_127">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if (i &lt; 4) {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)b.word[i];
+    } else {
+      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  } else {
+    u32 temp;
+    if (imm == 0) {
+      temp = (u32)a.word[i - 4];
+    } else {
+      temp =
+          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_127">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_w_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_128">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.w.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_128">Description</h3>
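+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 32-bit integer, and store the results to <code>dst</code>: the elements from <code>b</code> fill the lower half of <code>dst</code> and the elements from <code>a</code> fill the upper half.</p>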
+<h3 id="operation_128">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +
+             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_128">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_wu_d-__m128i-a-__m128i-b-imm0_63-imm">__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)</h2>
+<h3 id="synopsis_129">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.wu.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_129">Description</h3>
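+<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 32-bit integer, and store the results to <code>dst</code>: the elements from <code>b</code> fill the lower half of <code>dst</code> and the elements from <code>a</code> fill the upper half.</p>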
+<h3 id="operation_129">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if (i &lt; 2) {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)b.dword[i];
+    } else {
+      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  } else {
+    u64 temp;
+    if (imm == 0) {
+      temp = (u64)a.dword[i - 2];
+    } else {
+      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +
+             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_129">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>4</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>4</td>
+<td>1</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_d_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_130">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.d.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_130">Description</h3>
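+<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in a signed 64-bit integer, and store the results to <code>dst</code>: the element from <code>b</code> fills the lower half of <code>dst</code> and the element from <code>a</code> fills the upper half.</p>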
+<h3 id="operation_130">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +
+             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_130">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vssrlrni_du_q-__m128i-a-__m128i-b-imm0_127-imm">__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)</h2>
+<h3 id="synopsis_131">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vssrlrni.du.q vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_131">Description</h3>
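+<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp each result to fit in an unsigned 64-bit integer, and store the results to <code>dst</code>: the element from <code>b</code> fills the lower half of <code>dst</code> and the element from <code>a</code> fills the upper half.</p>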
+<h3 id="operation_131">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if (i &lt; 1) {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)b.qword[i];
+    } else {
+      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  } else {
+    u128 temp;
+    if (imm == 0) {
+      temp = (u128)a.qword[i - 1];
+    } else {
+      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +
+             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);
+    }
+    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);
+  }
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_131">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>3</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotr_b-__m128i-a-__m128i-b">__m128i __lsx_vrotr_b (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_132">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotr_b (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotr.b vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_132">Description</h3>
+<p>Rotate right the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_132">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] =
+      (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) | (a.byte[i] &lt;&lt; (8 - (b.byte[i] &amp; 0x7)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_132">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotr_h-__m128i-a-__m128i-b">__m128i __lsx_vrotr_h (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_133">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotr_h (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotr.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_133">Description</h3>
+<p>Rotate right the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_133">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) |
+                (a.half[i] &lt;&lt; (16 - (b.half[i] &amp; 0xf)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_133">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotr_w-__m128i-a-__m128i-b">__m128i __lsx_vrotr_w (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_134">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotr_w (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotr.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_134">Description</h3>
+<p>Rotate right the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_134">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) |
+                (a.word[i] &lt;&lt; (32 - (b.word[i] &amp; 0x1f)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_134">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotr_d-__m128i-a-__m128i-b">__m128i __lsx_vrotr_d (__m128i a, __m128i b)</h2>
+<h3 id="synopsis_135">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotr_d (__m128i a, __m128i b)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotr.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_135">Description</h3>
+<p>Rotate right the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_135">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) |
+                 (a.dword[i] &lt;&lt; (64 - (b.dword[i] &amp; 0x3f)));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_135">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotri_b-__m128i-a-imm0_7-imm">__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)</h2>
+<h3 id="synopsis_136">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotri.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_136">Description</h3>
+<p>Rotate right the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_136">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = (a.byte[i] &gt;&gt; imm) | (a.byte[i] &lt;&lt; (8 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_136">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotri_h-__m128i-a-imm0_15-imm">__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)</h2>
+<h3 id="synopsis_137">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotri.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_137">Description</h3>
+<p>Rotate right the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_137">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = (a.half[i] &gt;&gt; imm) | (a.half[i] &lt;&lt; (16 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_137">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotri_w-__m128i-a-imm0_31-imm">__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)</h2>
+<h3 id="synopsis_138">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotri.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_138">Description</h3>
+<p>Rotate right the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_138">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = (a.word[i] &gt;&gt; imm) | (a.word[i] &lt;&lt; (32 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_138">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vrotri_d-__m128i-a-imm0_63-imm">__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)</h2>
+<h3 id="synopsis_139">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vrotri.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_139">Description</h3>
+<p>Rotate right the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>
+<h3 id="operation_139">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  dst.dword[i] = (a.dword[i] &gt;&gt; imm) | (a.dword[i] &lt;&lt; (64 - imm));
+}
+</code></pre>
+<p>Tested on real machine.</p>
+<h3 id="latency-and-throughput_139">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (IPC)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>2</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../permutation/" class="btn btn-neutral float-left" title="Permutation"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../shuffling/" class="btn btn-neutral float-right" title="Shuffling">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../permutation/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../shuffling/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/lsx/shuffling/index.html b/lsx/shuffling/index.html
new file mode 100644
index 00000000..ceadbba7
--- /dev/null
+++ b/lsx/shuffling/index.html
@@ -0,0 +1,673 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shuffling/" />
+      <link rel="shortcut icon" href="../../img/favicon.ico" />
+    <title>Shuffling - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../../css/theme.css" />
+    <link rel="stylesheet" href="../../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Shuffling";
+        var mkdocs_page_input_path = "lsx/shuffling.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/lsx/shuffling/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href="../.." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul class="current">
+                  <li class="toctree-l1"><a class="reference internal" href="../bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1 current"><a class="reference internal current" href="./">Shuffling</a>
+    <ul class="current">
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_1">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_1">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_1">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_1">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_1">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_2">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_2">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_2">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_2">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_2">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_3">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_3">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_3">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_3">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_3">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf4i_b-__m128i-a-imm0_255-imm">__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_4">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_4">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_4">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_4">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_4">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf4i_h-__m128i-a-imm0_255-imm">__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_5">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_5">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_5">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_5">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_5">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf4i_w-__m128i-a-imm0_255-imm">__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_6">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_6">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_6">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_6">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_6">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    <li class="toctree-l2"><a class="reference internal" href="#__m128i-__lsx_vshuf4i_d-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)</a>
+        <ul>
+    <li class="toctree-l3"><a class="reference internal" href="#synopsis_7">Synopsis</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#description_7">Description</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#examples_7">Examples</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#operation_7">Operation</a>
+    </li>
+    <li class="toctree-l3"><a class="reference internal" href="#latency-and-throughput_7">Latency and Throughput</a>
+    </li>
+        </ul>
+    </li>
+    </ul>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../.." class="icon icon-home" aria-label="Docs"></a></li>
+          <li class="breadcrumb-item">LSX</li>
+      <li class="breadcrumb-item active">Shuffling</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="shuffling">Shuffling</h1>
+<h2 id="__m128i-__lsx_vshuf_b-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf.b vr, vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description">Description</h3>
+<p>Shuffle bytes from <code>a</code> and <code>b</code> with indices from <code>c</code>.</p>
+<p>Caveat: the indices are placed in <code>c</code>, while in other <code>vshuf</code> intrinsics, they are placed in <code>a</code>.</p>
+<p><img alt="" src="../../diagram/vshuf_b.svg" /></p>
+<h3 id="examples">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, __m128i{0x0011021304050607, 0x0811120213031404})
+= 0x7877155513efcdab 0x2177661555144413
+</code></pre>
+<h3 id="operation">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  if (c.byte[i] &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.byte[i] = 0;
+  } else if ((c.byte[i] % 32) &lt; 16) {
+    dst.byte[i] = b.byte[c.byte[i] % 16];
+  } else {
+    dst.byte[i] = a.byte[c.byte[i] % 16];
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf_h-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_1">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf.h vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_1">Description</h3>
+<p>Shuffle 16-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf_h.svg" /></p>
+<h3 id="examples_1">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_h(__m128i{0x0001000200030004, 0x0005000a000b000c}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x1415ef13abcd4321 0x432133441122ff00
+</code></pre>
+<h3 id="operation_1">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  if ((a.half[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.half[i] = 0;
+  } else if ((a.half[i] % 16) &lt; 8) {
+    dst.half[i] = c.half[a.half[i] % 8];
+  } else {
+    dst.half[i] = b.half[a.half[i] % 8];
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_1">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf_w-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_2">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf.w vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_2">Description</h3>
+<p>Shuffle 32-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf_w.svg" /></p>
+<h3 id="examples_2">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_w(__m128i{0x0000000200000004, 0x0000000700000005}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x4321432155667788 0x99aabbcc11223344
+</code></pre>
+<h3 id="operation_2">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  if ((a.word[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.word[i] = 0;
+  } else if ((a.word[i] % 8) &lt; 4) {
+    dst.word[i] = c.word[a.word[i] % 4];
+  } else {
+    dst.word[i] = b.word[a.word[i] % 4];
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_2">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf_d-__m128i-a-__m128i-b-__m128i-c">__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)</h2>
+<h3 id="synopsis_3">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf.d vr, vr, vr
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_3">Description</h3>
+<p>Shuffle 64-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf_d.svg" /></p>
+<h3 id="examples_3">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf_d(__m128i{0x0000000000000001, 0x0000000000000002}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})
+= 0x1234123443214321 0x1122334455667788
+</code></pre>
+<h3 id="operation_3">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 2; i++) {
+  if ((a.dword[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {
+    // Caveat: observed in 3C5000
+    dst.dword[i] = 0;
+  } else if ((a.dword[i] % 4) &lt; 2) {
+    dst.dword[i] = c.dword[a.dword[i] % 2];
+  } else {
+    dst.dword[i] = b.dword[a.dword[i] % 2];
+  }
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_3">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf4i_b-__m128i-a-imm0_255-imm">__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_4">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf4i.b vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_4">Description</h3>
+<p>Shuffle every four 8-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf4i_b.svg" /></p>
+<h3 id="examples_4">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_b(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x13ef13cd78667815 0x3412343421432121
+</code></pre>
+<h3 id="operation_4">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 16; i++) {
+  dst.byte[i] = a.byte[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_4">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf4i_h-__m128i-a-imm0_255-imm">__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_5">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf4i.h vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_5">Description</h3>
+<p>Shuffle every four 16-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf4i_h.svg" /></p>
+<h3 id="examples_5">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_h(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x667814156678ef13 0x4321432143211234
+</code></pre>
+<h3 id="operation_5">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 8; i++) {
+  dst.half[i] = a.half[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_5">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf4i_w-__m128i-a-imm0_255-imm">__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)</h2>
+<h3 id="synopsis_6">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf4i.w vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_6">Description</h3>
+<p>Shuffle every four 32-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf4i_w.svg" /></p>
+<h3 id="examples_6">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_w(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0x1415667843214321 0x14156678abcdef13
+</code></pre>
+<h3 id="operation_6">Operation</h3>
+<pre><code class="language-c++">for (int i = 0; i &lt; 4; i++) {
+  dst.word[i] = a.word[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];
+}
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_6">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+<h2 id="__m128i-__lsx_vshuf4i_d-__m128i-a-__m128i-b-imm0_255-imm">__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)</h2>
+<h3 id="synopsis_7">Synopsis</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)
+#include &lt;lsxintrin.h&gt;
+Instruction: vshuf4i.d vr, vr, imm
+CPU Flags: LSX
+</code></pre>
+<h3 id="description_7">Description</h3>
+<p>Select two 64-bit elements from the four 64-bit elements in <code>a</code> and <code>b</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>
+<p><img alt="" src="../../diagram/vshuf4i_d.svg" /></p>
+<h3 id="examples_7">Examples</h3>
+<pre><code class="language-c++">__m128i __lsx_vshuf4i_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)
+= 0xabcdef1314156678 0x1122334455667788
+</code></pre>
+<h3 id="operation_7">Operation</h3>
+<pre><code class="language-c++">dst.dword[0] = (imm &amp; 2) ? b.dword[(imm &amp; 1)] : a.dword[(imm &amp; 1)];
+dst.dword[1] =
+    (imm &amp; 8) ? b.dword[((imm &gt;&gt; 2) &amp; 1)] : a.dword[((imm &gt;&gt; 2) &amp; 1)];
+</code></pre>
+<p>Tested on a real machine.</p>
+<h3 id="latency-and-throughput_7">Latency and Throughput</h3>
+<table>
+<thead>
+<tr>
+<th>CPU</th>
+<th>Latency</th>
+<th>Throughput (CPI)</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>3A6000</td>
+<td>1</td>
+<td>4</td>
+</tr>
+<tr>
+<td>3C5000</td>
+<td>1</td>
+<td>2</td>
+</tr>
+</tbody>
+</table>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../shift/" class="btn btn-neutral float-left" title="Shift"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../shift/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+  </span>
+</div>
+    <script src="../../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "../..";</script>
+    <script src="../../js/theme_extra.js"></script>
+    <script src="../../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/main.css b/main.css
new file mode 100644
index 00000000..f7ffbf8f
--- /dev/null
+++ b/main.css
@@ -0,0 +1,3 @@
+[v-cloak] {
+  display: none
+}
\ No newline at end of file
diff --git a/migrating_avx/index.html b/migrating_avx/index.html
new file mode 100644
index 00000000..2b2c8661
--- /dev/null
+++ b/migrating_avx/index.html
@@ -0,0 +1,2185 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/migrating_avx/" />
+      <link rel="shortcut icon" href="../img/favicon.ico" />
+    <title>Migrating from AVX to LASX - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../css/theme.css" />
+    <link rel="stylesheet" href="../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Migrating from AVX to LASX";
+        var mkdocs_page_input_path = "migrating_avx.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/migrating_avx/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href=".." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul class="current">
+                <li class="toctree-l1 current"><a class="reference internal current" href="./">Migrating from AVX to LASX</a>
+    <ul class="current">
+    </ul>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LASX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LSX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href=".." class="icon icon-home" aria-label="Docs"></a></li>
+      <li class="breadcrumb-item active">Migrating from AVX to LASX</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="migrating-from-avx-to-lasx">Migrating from AVX to LASX</h1>
+<p>AVX is a 256-bit SIMD extension to x86. It is possible to migrate existing AVX code to leverage the LoongArch LASX extension by rewriting the intrinsics or instructions manually, or by using tools like <a href="https://github.com/simd-everywhere/simde">SIMD Everywhere</a> to implement AVX intrinsics with their LASX counterparts. However, to unleash the full performance, you may want to port your code to LASX manually.</p>
+<p>Thankfully, LASX intrinsics adopt the same types as AVX, so you can use the following familiar SIMD types:</p>
+<ul>
+<li>__m256: 256-bit vector of single precision floating point numbers</li>
+<li>__m256d: 256-bit vector of double precision floating point numbers</li>
+<li>__m256i: 256-bit vector of integers, which can be of any width</li>
+</ul>
+<p>Here is a table mapping AVX intrinsics to their LASX counterparts (WIP):</p>
+<table>
+<thead>
+<tr>
+<th>AVX</th>
+<th>LASX</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>_mm256_abs_epi16</td>
+<td>__lasx_xvsigncov_h</td>
+</tr>
+<tr>
+<td>_mm256_abs_epi32</td>
+<td>__lasx_xvsigncov_w</td>
+</tr>
+<tr>
+<td>_mm256_abs_epi8</td>
+<td>__lasx_xvsigncov_b</td>
+</tr>
+<tr>
+<td>_mm256_add_epi16</td>
+<td>__lasx_xvadd_h</td>
+</tr>
+<tr>
+<td>_mm256_add_epi32</td>
+<td>__lasx_xvadd_w</td>
+</tr>
+<tr>
+<td>_mm256_add_epi64</td>
+<td>__lasx_xvadd_d</td>
+</tr>
+<tr>
+<td>_mm256_add_epi8</td>
+<td>__lasx_xvadd_b</td>
+</tr>
+<tr>
+<td>_mm256_add_pd</td>
+<td>__lasx_xvfadd_d</td>
+</tr>
+<tr>
+<td>_mm256_add_ps</td>
+<td>__lasx_xvfadd_s</td>
+</tr>
+<tr>
+<td>_mm256_adds_epi16</td>
+<td>__lasx_xvsadd_h</td>
+</tr>
+<tr>
+<td>_mm256_adds_epi8</td>
+<td>__lasx_xvsadd_b</td>
+</tr>
+<tr>
+<td>_mm256_adds_epu16</td>
+<td>__lasx_xvsadd_hu</td>
+</tr>
+<tr>
+<td>_mm256_adds_epu8</td>
+<td>__lasx_xvsadd_bu</td>
+</tr>
+<tr>
+<td>_mm256_addsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_addsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_alignr_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_and_pd</td>
+<td>__lasx_xvand_v</td>
+</tr>
+<tr>
+<td>_mm256_and_ps</td>
+<td>__lasx_xvand_v</td>
+</tr>
+<tr>
+<td>_mm256_and_si256</td>
+<td>__lasx_xvand_v</td>
+</tr>
+<tr>
+<td>_mm256_andnot_pd</td>
+<td>__lasx_xvandn_v</td>
+</tr>
+<tr>
+<td>_mm256_andnot_ps</td>
+<td>__lasx_xvandn_v</td>
+</tr>
+<tr>
+<td>_mm256_andnot_si256</td>
+<td>__lasx_xvandn_v</td>
+</tr>
+<tr>
+<td>_mm256_avg_epu16</td>
+<td>__lasx_xvavgr_hu</td>
+</tr>
+<tr>
+<td>_mm256_avg_epu8</td>
+<td>__lasx_xvavgr_bu</td>
+</tr>
+<tr>
+<td>_mm256_bcstnebf16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_bcstnesh_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blend_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blend_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blend_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blend_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blendv_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blendv_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_blendv_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcast_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcast_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcast_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcast_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastb_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastq_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastsd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastsi128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastss_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_broadcastw_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_bslli_epi128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_bsrli_epi128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castpd128_pd256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castpd256_pd128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castpd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castpd_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castps128_ps256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castps256_ps128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castps_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castps_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castsi128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castsi256_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castsi256_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_castsi256_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_ceil_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_ceil_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmp_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmp_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpeq_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpeq_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpeq_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpeq_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpgt_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpgt_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpgt_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cmpgt_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi16_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi16_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi32_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi32_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi32_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi8_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi8_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepi8_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepu16_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepu16_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepu32_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepu8_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepu8_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtepu8_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtneebf16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtneeph_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtneobf16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtneoph_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtneps_avx_pbh</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtneps_pbh</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtpd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtpd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtph_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtps_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtps_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtps_ph</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtsd_f64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtsi256_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvtss_f32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvttpd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_cvttps_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_div_pd</td>
+<td>__lasx_xvfdiv_d</td>
+</tr>
+<tr>
+<td>_mm256_div_ps</td>
+<td>__lasx_xvfdiv_s</td>
+</tr>
+<tr>
+<td>_mm256_dp_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbssd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbssds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbsud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbsuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbusd_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbusd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbusds_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbusds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbuud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpbuuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwssd_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwssd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwssds_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwssds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwsud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwsuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwusd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwusds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwuud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_dpwuuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extract_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extract_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extract_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extract_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extractf128_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extractf128_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extractf128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_extracti128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_floor_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_floor_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmaddsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmaddsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmsubadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fmsubadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fnmadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fnmadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fnmsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_fnmsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hadd_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hadd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hadds_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hsub_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hsub_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_hsubs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i32gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i32gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i32gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i32gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i64gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i64gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i64gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_i64gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insert_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insert_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insert_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insert_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insertf128_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insertf128_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_insertf128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_inserti128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_lddqu_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_load_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_load_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_load_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_loadu2_m128d</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_loadu2_m128i</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_loadu2_m128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_loadu_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_loadu_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_loadu_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_madd52hi_avx_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_madd52hi_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_madd52lo_avx_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_madd52lo_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_madd_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maddubs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i32gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i32gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i32gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i32gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i64gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i64gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i64gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mask_i64gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskload_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskload_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskload_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskload_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskstore_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskstore_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskstore_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_maskstore_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_max_epi16</td>
+<td>__lasx_xvmax_h</td>
+</tr>
+<tr>
+<td>_mm256_max_epi32</td>
+<td>__lasx_xvmax_w</td>
+</tr>
+<tr>
+<td>_mm256_max_epi8</td>
+<td>__lasx_xvmax_b</td>
+</tr>
+<tr>
+<td>_mm256_max_epu16</td>
+<td>__lasx_xvmax_hu</td>
+</tr>
+<tr>
+<td>_mm256_max_epu32</td>
+<td>__lasx_xvmax_wu</td>
+</tr>
+<tr>
+<td>_mm256_max_epu8</td>
+<td>__lasx_xvmax_bu</td>
+</tr>
+<tr>
+<td>_mm256_max_pd</td>
+<td>__lasx_xvfmax_d</td>
+</tr>
+<tr>
+<td>_mm256_max_ps</td>
+<td>__lasx_xvfmax_s</td>
+</tr>
+<tr>
+<td>_mm256_min_epi16</td>
+<td>__lasx_xvmin_h</td>
+</tr>
+<tr>
+<td>_mm256_min_epi32</td>
+<td>__lasx_xvmin_w</td>
+</tr>
+<tr>
+<td>_mm256_min_epi8</td>
+<td>__lasx_xvmin_b</td>
+</tr>
+<tr>
+<td>_mm256_min_epu16</td>
+<td>__lasx_xvmin_hu</td>
+</tr>
+<tr>
+<td>_mm256_min_epu32</td>
+<td>__lasx_xvmin_wu</td>
+</tr>
+<tr>
+<td>_mm256_min_epu8</td>
+<td>__lasx_xvmin_bu</td>
+</tr>
+<tr>
+<td>_mm256_min_pd</td>
+<td>__lasx_xvfmin_d</td>
+</tr>
+<tr>
+<td>_mm256_min_ps</td>
+<td>__lasx_xvfmin_s</td>
+</tr>
+<tr>
+<td>_mm256_movedup_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_movehdup_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_moveldup_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_movemask_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_movemask_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_movemask_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mpsadbw_epu8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mul_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mul_epu32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mul_pd</td>
+<td>__lasx_xvfmul_d</td>
+</tr>
+<tr>
+<td>_mm256_mul_ps</td>
+<td>__lasx_xvfmul_s</td>
+</tr>
+<tr>
+<td>_mm256_mulhi_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mulhi_epu16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mulhrs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mullo_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_mullo_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_or_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_or_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_or_si256</td>
+<td>__lasx_xvor_v</td>
+</tr>
+<tr>
+<td>_mm256_packs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_packs_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_packus_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_packus_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute2f128_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute2f128_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute2f128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute2x128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute4x64_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute4x64_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permute_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permutevar8x32_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permutevar8x32_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permutevar_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_permutevar_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_rcp_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_round_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_round_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_rsqrt_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sad_epu8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set1_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set1_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set1_epi64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set1_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set1_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set1_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_epi64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_m128d</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_m128i</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_m128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_set_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_epi64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_m128d</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_m128i</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_m128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setr_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setzero_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setzero_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_setzero_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sha512msg1_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sha512msg2_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sha512rnds2_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_shuffle_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_shuffle_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_shuffle_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_shuffle_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_shufflehi_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_shufflelo_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sign_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sign_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sign_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sll_epi16</td>
+<td>__lasx_xvsll_h</td>
+</tr>
+<tr>
+<td>_mm256_sll_epi32</td>
+<td>__lasx_xvsll_w</td>
+</tr>
+<tr>
+<td>_mm256_sll_epi64</td>
+<td>__lasx_xvsll_d</td>
+</tr>
+<tr>
+<td>_mm256_slli_epi16</td>
+<td>__lasx_xvslli_h</td>
+</tr>
+<tr>
+<td>_mm256_slli_epi32</td>
+<td>__lasx_xvslli_w</td>
+</tr>
+<tr>
+<td>_mm256_slli_epi64</td>
+<td>__lasx_xvslli_d</td>
+</tr>
+<tr>
+<td>_mm256_slli_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sllv_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sllv_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sm4key4_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sm4rnds4_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sqrt_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sqrt_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sra_epi16</td>
+<td>__lasx_xvsra_h</td>
+</tr>
+<tr>
+<td>_mm256_sra_epi32</td>
+<td>__lasx_xvsra_w</td>
+</tr>
+<tr>
+<td>_mm256_srai_epi16</td>
+<td>__lasx_xvsrai_h</td>
+</tr>
+<tr>
+<td>_mm256_srai_epi32</td>
+<td>__lasx_xvsrai_w</td>
+</tr>
+<tr>
+<td>_mm256_srav_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_srl_epi16</td>
+<td>__lasx_xvsrl_h</td>
+</tr>
+<tr>
+<td>_mm256_srl_epi32</td>
+<td>__lasx_xvsrl_w</td>
+</tr>
+<tr>
+<td>_mm256_srl_epi64</td>
+<td>__lasx_xvsrl_d</td>
+</tr>
+<tr>
+<td>_mm256_srli_epi16</td>
+<td>__lasx_xvsrli_h</td>
+</tr>
+<tr>
+<td>_mm256_srli_epi32</td>
+<td>__lasx_xvsrli_w</td>
+</tr>
+<tr>
+<td>_mm256_srli_epi64</td>
+<td>__lasx_xvsrli_d</td>
+</tr>
+<tr>
+<td>_mm256_srli_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_srlv_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_srlv_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_store_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_store_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_store_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_storeu2_m128d</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_storeu2_m128i</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_storeu2_m128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_storeu_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_storeu_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_storeu_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_stream_load_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_stream_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_stream_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_stream_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_sub_epi16</td>
+<td>__lasx_xvsub_h</td>
+</tr>
+<tr>
+<td>_mm256_sub_epi32</td>
+<td>__lasx_xvsub_w</td>
+</tr>
+<tr>
+<td>_mm256_sub_epi64</td>
+<td>__lasx_xvsub_d</td>
+</tr>
+<tr>
+<td>_mm256_sub_epi8</td>
+<td>__lasx_xvsub_b</td>
+</tr>
+<tr>
+<td>_mm256_sub_pd</td>
+<td>__lasx_xvfsub_d</td>
+</tr>
+<tr>
+<td>_mm256_sub_ps</td>
+<td>__lasx_xvfsub_s</td>
+</tr>
+<tr>
+<td>_mm256_subs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_subs_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_subs_epu16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_subs_epu8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testc_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testc_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testc_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testnzc_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testnzc_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testnzc_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testz_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testz_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_testz_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_undefined_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_undefined_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_undefined_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_unpackhi_epi16</td>
+<td>__lasx_xvilvh_h</td>
+</tr>
+<tr>
+<td>_mm256_unpackhi_epi32</td>
+<td>__lasx_xvilvh_w</td>
+</tr>
+<tr>
+<td>_mm256_unpackhi_epi64</td>
+<td>__lasx_xvilvh_d</td>
+</tr>
+<tr>
+<td>_mm256_unpackhi_epi8</td>
+<td>__lasx_xvilvh_b</td>
+</tr>
+<tr>
+<td>_mm256_unpackhi_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_unpackhi_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_unpacklo_epi16</td>
+<td>__lasx_xvilvl_h</td>
+</tr>
+<tr>
+<td>_mm256_unpacklo_epi32</td>
+<td>__lasx_xvilvl_w</td>
+</tr>
+<tr>
+<td>_mm256_unpacklo_epi64</td>
+<td>__lasx_xvilvl_d</td>
+</tr>
+<tr>
+<td>_mm256_unpacklo_epi8</td>
+<td>__lasx_xvilvl_b</td>
+</tr>
+<tr>
+<td>_mm256_unpacklo_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_unpacklo_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_xor_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_xor_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_xor_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_zeroall</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_zeroupper</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_zextpd128_pd256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_zextps128_ps256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm256_zextsi128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_bcstnebf16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_bcstnesh_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_blend_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcast_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastb_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastq_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastsd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastsi128_si256</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastss_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_broadcastw_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmp_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmp_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmp_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmp_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtneebf16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtneeph_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtneobf16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtneoph_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtneps_avx_pbh</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtneps_pbh</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtph_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtps_ph</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbssd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbssds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbsud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbsuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbusd_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbusd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbusds_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbusds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbuud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpbuuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwssd_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwssd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwssds_avx_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwssds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwsud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwsuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwusd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwusds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwuud_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dpwuuds_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmadd_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmadd_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmaddsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmaddsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmsub_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmsub_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmsubadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fmsubadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmadd_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmadd_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmsub_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_fnmsub_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i32gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i32gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i32gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i32gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i64gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i64gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i64gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_i64gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_madd52hi_avx_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_madd52hi_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_madd52lo_avx_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_madd52lo_epu64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i32gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i32gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i32gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i32gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i64gather_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i64gather_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i64gather_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mask_i64gather_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskload_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskload_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskload_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskload_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskstore_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskstore_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskstore_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskstore_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_permute_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_permute_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_permutevar_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_permutevar_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sllv_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sllv_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sm3msg1_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sm3msg2_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sm3rnds2_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sm4key4_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sm4rnds4_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_srav_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_srlv_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_srlv_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testc_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testc_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testnzc_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testnzc_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testz_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testz_ps</td>
+<td></td>
+</tr>
+</tbody>
+</table>
+<p>The list of AVX intrinsics comes from the <a href="https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htm">Intel Intrinsics Guide</a>.</p>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../latency_throughput/" class="btn btn-neutral float-left" title="Latency and Throughput of Instructions"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../migrating_sse/" class="btn btn-neutral float-right" title="Migrating from SSE to LSX">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../latency_throughput/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../migrating_sse/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "..";</script>
+    <script src="../js/theme_extra.js"></script>
+    <script src="../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/migrating_sse/index.html b/migrating_sse/index.html
new file mode 100644
index 00000000..f0ca3bb1
--- /dev/null
+++ b/migrating_sse/index.html
@@ -0,0 +1,1997 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/migrating_sse/" />
+      <link rel="shortcut icon" href="../img/favicon.ico" />
+    <title>Migrating from SSE to LSX - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../css/theme.css" />
+    <link rel="stylesheet" href="../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Migrating from SSE to LSX";
+        var mkdocs_page_input_path = "migrating_sse.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/migrating_sse/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href=".." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul class="current">
+                <li class="toctree-l1 current"><a class="reference internal current" href="./">Migrating from SSE to LSX</a>
+    <ul class="current">
+    </ul>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../viewer/">Browse All Intrinsics</a>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LASX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">LSX</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/memory/">Memory Load & Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href=".." class="icon icon-home" aria-label="Docs"></a></li>
+      <li class="breadcrumb-item active">Migrating from SSE to LSX</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="migrating-from-sse-to-lsx">Migrating from SSE to LSX</h1>
+<p>SSE is a 128-bit SIMD extension to x86. It is possible to migrate existing SSE code to the LoongArch LSX extension either by rewriting the intrinsics or instructions manually, or by using tools like <a href="https://github.com/simd-everywhere/simde">SIMD Everywhere</a> to implement SSE intrinsics with their LSX counterparts. However, to unleash the full performance, you may want to port your code to LSX manually.</p>
+<p>Thankfully, LSX intrinsics use the same vector types as SSE, so you can keep using the following familiar types for SIMD:</p>
+<ul>
+<li>__m128: 128-bit vector of single-precision floating-point numbers</li>
+<li>__m128d: 128-bit vector of double-precision floating-point numbers</li>
+<li>__m128i: 128-bit vector of integers, which can be of any element width</li>
+</ul>
+<p>Here is a table mapping SSE intrinsics to their LSX counterparts (WIP):</p>
+<table>
+<thead>
+<tr>
+<th>SSE</th>
+<th>LSX</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td>_mm_abs_epi16</td>
+<td>__lsx_vsigncov_h</td>
+</tr>
+<tr>
+<td>_mm_abs_epi32</td>
+<td>__lsx_vsigncov_w</td>
+</tr>
+<tr>
+<td>_mm_abs_epi8</td>
+<td>__lsx_vsigncov_b</td>
+</tr>
+<tr>
+<td>_mm_add_epi16</td>
+<td>__lsx_vadd_h</td>
+</tr>
+<tr>
+<td>_mm_add_epi32</td>
+<td>__lsx_vadd_w</td>
+</tr>
+<tr>
+<td>_mm_add_epi64</td>
+<td>__lsx_vadd_d</td>
+</tr>
+<tr>
+<td>_mm_add_epi8</td>
+<td>__lsx_vadd_b</td>
+</tr>
+<tr>
+<td>_mm_add_pd</td>
+<td>__lsx_vfadd_d</td>
+</tr>
+<tr>
+<td>_mm_add_ps</td>
+<td>__lsx_vfadd_s</td>
+</tr>
+<tr>
+<td>_mm_add_sd</td>
+<td>__lsx_vfadd_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_add_ss</td>
+<td>__lsx_vfadd_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_adds_epi16</td>
+<td>__lsx_vsadd_h</td>
+</tr>
+<tr>
+<td>_mm_adds_epi8</td>
+<td>__lsx_vsadd_b</td>
+</tr>
+<tr>
+<td>_mm_adds_epu16</td>
+<td>__lsx_vsadd_hu</td>
+</tr>
+<tr>
+<td>_mm_adds_epu8</td>
+<td>__lsx_vsadd_bu</td>
+</tr>
+<tr>
+<td>_mm_addsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_addsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_alignr_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_and_pd</td>
+<td>__lsx_vand_v</td>
+</tr>
+<tr>
+<td>_mm_and_ps</td>
+<td>__lsx_vand_v</td>
+</tr>
+<tr>
+<td>_mm_and_si128</td>
+<td>__lsx_vand_v</td>
+</tr>
+<tr>
+<td>_mm_andnot_pd</td>
+<td>__lsx_vandn_v</td>
+</tr>
+<tr>
+<td>_mm_andnot_ps</td>
+<td>__lsx_vandn_v</td>
+</tr>
+<tr>
+<td>_mm_andnot_si128</td>
+<td>__lsx_vandn_v</td>
+</tr>
+<tr>
+<td>_mm_avg_epu16</td>
+<td>__lsx_vavgr_hu</td>
+</tr>
+<tr>
+<td>_mm_avg_epu8</td>
+<td>__lsx_vavgr_bu</td>
+</tr>
+<tr>
+<td>_mm_blend_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_blend_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_blend_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_blendv_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_blendv_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_blendv_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_bslli_si128</td>
+<td>__lsx_vbsll_v</td>
+</tr>
+<tr>
+<td>_mm_bsrli_si128</td>
+<td>__lsx_vbsrl_v</td>
+</tr>
+<tr>
+<td>_mm_castpd_ps</td>
+<td>type conversion</td>
+</tr>
+<tr>
+<td>_mm_castpd_si128</td>
+<td>type conversion</td>
+</tr>
+<tr>
+<td>_mm_castps_pd</td>
+<td>type conversion</td>
+</tr>
+<tr>
+<td>_mm_castps_si128</td>
+<td>type conversion</td>
+</tr>
+<tr>
+<td>_mm_castsi128_pd</td>
+<td>type conversion</td>
+</tr>
+<tr>
+<td>_mm_castsi128_ps</td>
+<td>type conversion</td>
+</tr>
+<tr>
+<td>_mm_ceil_pd</td>
+<td>__lsx_vfrintrp_d</td>
+</tr>
+<tr>
+<td>_mm_ceil_ps</td>
+<td>__lsx_vfrintrp_s</td>
+</tr>
+<tr>
+<td>_mm_ceil_sd</td>
+<td>__lsx_vfrintrp_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_ceil_ss</td>
+<td>__lsx_vfrintrp_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_epi16</td>
+<td>__lsx_vseq_h</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_epi32</td>
+<td>__lsx_vseq_w</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_epi64</td>
+<td>__lsx_vseq_d</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_epi8</td>
+<td>__lsx_vseq_b</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_pd</td>
+<td>__lsx_vfcmp_ceq_d</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_ps</td>
+<td>__lsx_vfcmp_ceq_s</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_sd</td>
+<td>__lsx_vfcmp_ceq_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpeq_ss</td>
+<td>__lsx_vfcmp_ceq_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpestra</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpestrc</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpestri</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpestrm</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpestro</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpestrs</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpestrz</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpge_pd</td>
+<td>__lsx_vfcmp_cle_d</td>
+</tr>
+<tr>
+<td>_mm_cmpge_ps</td>
+<td>__lsx_vfcmp_cle_s</td>
+</tr>
+<tr>
+<td>_mm_cmpge_sd</td>
+<td>__lsx_vfcmp_cle_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpge_ss</td>
+<td>__lsx_vfcmp_cle_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_epi16</td>
+<td>__lsx_vslt_h</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_epi32</td>
+<td>__lsx_vslt_w</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_epi64</td>
+<td>__lsx_vslt_d</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_epi8</td>
+<td>__lsx_vslt_b</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_pd</td>
+<td>__lsx_vfcmp_clt_d</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_ps</td>
+<td>__lsx_vfcmp_clt_s</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_sd</td>
+<td>__lsx_vfcmp_clt_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpgt_ss</td>
+<td>__lsx_vfcmp_clt_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpistra</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpistrc</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpistri</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpistrm</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpistro</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpistrs</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmpistrz</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cmple_pd</td>
+<td>__lsx_vfcmp_cle_d</td>
+</tr>
+<tr>
+<td>_mm_cmple_ps</td>
+<td>__lsx_vfcmp_cle_s</td>
+</tr>
+<tr>
+<td>_mm_cmple_sd</td>
+<td>__lsx_vfcmp_cle_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmple_ss</td>
+<td>__lsx_vfcmp_cle_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmplt_epi16</td>
+<td>__lsx_vslt_h</td>
+</tr>
+<tr>
+<td>_mm_cmplt_epi32</td>
+<td>__lsx_vslt_w</td>
+</tr>
+<tr>
+<td>_mm_cmplt_epi8</td>
+<td>__lsx_vslt_b</td>
+</tr>
+<tr>
+<td>_mm_cmplt_pd</td>
+<td>__lsx_vfcmp_clt_d</td>
+</tr>
+<tr>
+<td>_mm_cmplt_ps</td>
+<td>__lsx_vfcmp_clt_s</td>
+</tr>
+<tr>
+<td>_mm_cmplt_sd</td>
+<td>__lsx_vfcmp_clt_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmplt_ss</td>
+<td>__lsx_vfcmp_clt_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpneq_pd</td>
+<td>__lsx_vfcmp_cune_d</td>
+</tr>
+<tr>
+<td>_mm_cmpneq_ps</td>
+<td>__lsx_vfcmp_cune_s</td>
+</tr>
+<tr>
+<td>_mm_cmpneq_sd</td>
+<td>__lsx_vfcmp_cune_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpneq_ss</td>
+<td>__lsx_vfcmp_cune_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpnge_pd</td>
+<td>__lsx_vfcmp_cult_d</td>
+</tr>
+<tr>
+<td>_mm_cmpnge_ps</td>
+<td>__lsx_vfcmp_cult_s</td>
+</tr>
+<tr>
+<td>_mm_cmpnge_sd</td>
+<td>__lsx_vfcmp_cult_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpnge_ss</td>
+<td>__lsx_vfcmp_cult_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpngt_pd</td>
+<td>__lsx_vfcmp_cule_d</td>
+</tr>
+<tr>
+<td>_mm_cmpngt_ps</td>
+<td>__lsx_vfcmp_cule_s</td>
+</tr>
+<tr>
+<td>_mm_cmpngt_sd</td>
+<td>__lsx_vfcmp_cule_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpngt_ss</td>
+<td>__lsx_vfcmp_cule_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpnle_pd</td>
+<td>__lsx_vfcmp_cult_d</td>
+</tr>
+<tr>
+<td>_mm_cmpnle_ps</td>
+<td>__lsx_vfcmp_cult_s</td>
+</tr>
+<tr>
+<td>_mm_cmpnle_sd</td>
+<td>__lsx_vfcmp_cult_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpnle_ss</td>
+<td>__lsx_vfcmp_cult_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpnlt_pd</td>
+<td>__lsx_vfcmp_cule_d</td>
+</tr>
+<tr>
+<td>_mm_cmpnlt_ps</td>
+<td>__lsx_vfcmp_cule_s</td>
+</tr>
+<tr>
+<td>_mm_cmpnlt_sd</td>
+<td>__lsx_vfcmp_cule_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpnlt_ss</td>
+<td>__lsx_vfcmp_cule_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpord_pd</td>
+<td>__lsx_vfcmp_cor_d</td>
+</tr>
+<tr>
+<td>_mm_cmpord_ps</td>
+<td>__lsx_vfcmp_cor_s</td>
+</tr>
+<tr>
+<td>_mm_cmpord_sd</td>
+<td>__lsx_vfcmp_cor_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpord_ss</td>
+<td>__lsx_vfcmp_cor_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_cmpunord_pd</td>
+<td>__lsx_vfcmp_cun_d</td>
+</tr>
+<tr>
+<td>_mm_cmpunord_ps</td>
+<td>__lsx_vfcmp_cun_s</td>
+</tr>
+<tr>
+<td>_mm_cmpunord_sd</td>
+<td>__lsx_vfcmp_cun_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_cmpunord_ss</td>
+<td>__lsx_vfcmp_cun_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_comieq_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comieq_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comige_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comige_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comigt_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comigt_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comile_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comile_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comilt_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comilt_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comineq_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_comineq_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvt_pi2ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvt_ps2pi</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvt_si2ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvt_ss2si</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtepi16_epi32</td>
+<td>__lsx_vsllwil_w_h</td>
+</tr>
+<tr>
+<td>_mm_cvtepi16_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtepi32_epi64</td>
+<td>__lsx_vsllwil_d_w</td>
+</tr>
+<tr>
+<td>_mm_cvtepi32_pd</td>
+<td>__lsx_vffintl_d_w</td>
+</tr>
+<tr>
+<td>_mm_cvtepi32_ps</td>
+<td>__lsx_vffint_s_w</td>
+</tr>
+<tr>
+<td>_mm_cvtepi8_epi16</td>
+<td>__lsx_vsllwil_h_b</td>
+</tr>
+<tr>
+<td>_mm_cvtepi8_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtepi8_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtepu16_epi32</td>
+<td>__lsx_vsllwil_wu_hu</td>
+</tr>
+<tr>
+<td>_mm_cvtepu16_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtepu32_epi64</td>
+<td>__lsx_vsllwil_du_wu</td>
+</tr>
+<tr>
+<td>_mm_cvtepu8_epi16</td>
+<td>__lsx_vsllwil_hu_bu</td>
+</tr>
+<tr>
+<td>_mm_cvtepu8_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtepu8_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpd_epi32</td>
+<td>__lsx_vftint_w_d</td>
+</tr>
+<tr>
+<td>_mm_cvtpd_pi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpd_ps</td>
+<td>__lsx_vfcvt_s_d</td>
+</tr>
+<tr>
+<td>_mm_cvtpi16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpi32_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpi32_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpi32x2_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpi8_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtps_epi32</td>
+<td>__lsx_vftint_w_s</td>
+</tr>
+<tr>
+<td>_mm_cvtps_pd</td>
+<td>__lsx_vfcvtl_d_s</td>
+</tr>
+<tr>
+<td>_mm_cvtps_pi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtps_pi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtps_pi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpu16_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtpu8_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsd_f64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsd_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsd_si64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsd_si64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsd_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi128_si32</td>
+<td>__lsx_vpickve2gr_w</td>
+</tr>
+<tr>
+<td>_mm_cvtsi128_si64</td>
+<td>__lsx_vpickve2gr_d</td>
+</tr>
+<tr>
+<td>_mm_cvtsi128_si64x</td>
+<td>__lsx_vpickve2gr_d</td>
+</tr>
+<tr>
+<td>_mm_cvtsi32_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi32_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi32_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi64_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi64_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi64_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi64x_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtsi64x_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtss_f32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtss_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtss_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtss_si64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtt_ps2pi</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvtt_ss2si</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttpd_epi32</td>
+<td>__lsx_vftintrz_w_d</td>
+</tr>
+<tr>
+<td>_mm_cvttpd_pi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttps_epi32</td>
+<td>__lsx_vftintrz_w_s</td>
+</tr>
+<tr>
+<td>_mm_cvttps_pi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttsd_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttsd_si64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttsd_si64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttss_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_cvttss_si64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_div_pd</td>
+<td>__lsx_vfdiv_d</td>
+</tr>
+<tr>
+<td>_mm_div_ps</td>
+<td>__lsx_vfdiv_s</td>
+</tr>
+<tr>
+<td>_mm_div_sd</td>
+<td>__lsx_vfdiv_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_div_ss</td>
+<td>__lsx_vfdiv_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_dp_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_dp_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_extract_epi16</td>
+<td>__lsx_vpickve2gr_h</td>
+</tr>
+<tr>
+<td>_mm_extract_epi32</td>
+<td>__lsx_vpickve2gr_w</td>
+</tr>
+<tr>
+<td>_mm_extract_epi64</td>
+<td>__lsx_vpickve2gr_d</td>
+</tr>
+<tr>
+<td>_mm_extract_epi8</td>
+<td>__lsx_vpickve2gr_b</td>
+</tr>
+<tr>
+<td>_mm_extract_ps</td>
+<td>__lsx_vpickve2gr_w</td>
+</tr>
+<tr>
+<td>_mm_floor_pd</td>
+<td>__lsx_vfrintrm_d</td>
+</tr>
+<tr>
+<td>_mm_floor_ps</td>
+<td>__lsx_vfrintrm_s</td>
+</tr>
+<tr>
+<td>_mm_floor_sd</td>
+<td>__lsx_vfrintrm_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_floor_ss</td>
+<td>__lsx_vfrintrm_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_hadd_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hadd_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hadd_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hadd_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hadds_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hsub_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hsub_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hsub_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hsub_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_hsubs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_insert_epi16</td>
+<td>__lsx_vinsgr2vr_h</td>
+</tr>
+<tr>
+<td>_mm_insert_epi32</td>
+<td>__lsx_vinsgr2vr_w</td>
+</tr>
+<tr>
+<td>_mm_insert_epi64</td>
+<td>__lsx_vinsgr2vr_d</td>
+</tr>
+<tr>
+<td>_mm_insert_epi8</td>
+<td>__lsx_vinsgr2vr_b</td>
+</tr>
+<tr>
+<td>_mm_insert_ps</td>
+<td>__lsx_vinsgr2vr_w</td>
+</tr>
+<tr>
+<td>_mm_lddqu_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_load_pd</td>
+<td>__lsx_vld</td>
+</tr>
+<tr>
+<td>_mm_load_pd1</td>
+<td>__lsx_vldrepl_d</td>
+</tr>
+<tr>
+<td>_mm_load_ps</td>
+<td>__lsx_vld</td>
+</tr>
+<tr>
+<td>_mm_load_ps1</td>
+<td>__lsx_vldrepl_w</td>
+</tr>
+<tr>
+<td>_mm_load_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_load_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_load_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_load1_pd</td>
+<td>__lsx_vldrepl_d</td>
+</tr>
+<tr>
+<td>_mm_load1_ps</td>
+<td>__lsx_vldrepl_w</td>
+</tr>
+<tr>
+<td>_mm_loaddup_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadh_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadh_pi</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadl_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadl_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadl_pi</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadr_pd</td>
+<td>__lsx_vld + __lsx_vshuf4i_w</td>
+</tr>
+<tr>
+<td>_mm_loadr_ps</td>
+<td>__lsx_vld + __lsx_vshuf4i_w</td>
+</tr>
+<tr>
+<td>_mm_loadu_pd</td>
+<td>__lsx_vld</td>
+</tr>
+<tr>
+<td>_mm_loadu_ps</td>
+<td>__lsx_vld</td>
+</tr>
+<tr>
+<td>_mm_loadu_si128</td>
+<td>__lsx_vld</td>
+</tr>
+<tr>
+<td>_mm_loadu_si16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadu_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_loadu_si64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_madd_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maddubs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_maskmoveu_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_max_epi16</td>
+<td>__lsx_vmax_h</td>
+</tr>
+<tr>
+<td>_mm_max_epi32</td>
+<td>__lsx_vmax_w</td>
+</tr>
+<tr>
+<td>_mm_max_epi8</td>
+<td>__lsx_vmax_b</td>
+</tr>
+<tr>
+<td>_mm_max_epu16</td>
+<td>__lsx_vmax_hu</td>
+</tr>
+<tr>
+<td>_mm_max_epu32</td>
+<td>__lsx_vmax_wu</td>
+</tr>
+<tr>
+<td>_mm_max_epu8</td>
+<td>__lsx_vmax_bu</td>
+</tr>
+<tr>
+<td>_mm_max_pd</td>
+<td>__lsx_vfmax_d</td>
+</tr>
+<tr>
+<td>_mm_max_ps</td>
+<td>__lsx_vfmax_s</td>
+</tr>
+<tr>
+<td>_mm_max_sd</td>
+<td>__lsx_vfmax_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_max_ss</td>
+<td>__lsx_vfmax_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_min_epi16</td>
+<td>__lsx_vmin_h</td>
+</tr>
+<tr>
+<td>_mm_min_epi32</td>
+<td>__lsx_vmin_w</td>
+</tr>
+<tr>
+<td>_mm_min_epi8</td>
+<td>__lsx_vmin_b</td>
+</tr>
+<tr>
+<td>_mm_min_epu16</td>
+<td>__lsx_vmin_hu</td>
+</tr>
+<tr>
+<td>_mm_min_epu32</td>
+<td>__lsx_vmin_wu</td>
+</tr>
+<tr>
+<td>_mm_min_epu8</td>
+<td>__lsx_vmin_bu</td>
+</tr>
+<tr>
+<td>_mm_min_pd</td>
+<td>__lsx_vfmin_d</td>
+</tr>
+<tr>
+<td>_mm_min_ps</td>
+<td>__lsx_vfmin_s</td>
+</tr>
+<tr>
+<td>_mm_min_sd</td>
+<td>__lsx_vfmin_d + __lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_min_ss</td>
+<td>__lsx_vfmin_s + __lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_minpos_epu16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_move_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_move_sd</td>
+<td>__lsx_vextrins_d</td>
+</tr>
+<tr>
+<td>_mm_move_ss</td>
+<td>__lsx_vextrins_w</td>
+</tr>
+<tr>
+<td>_mm_movedup_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_movehdup_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_movehl_ps</td>
+<td>__lsx_vilvh_d</td>
+</tr>
+<tr>
+<td>_mm_moveldup_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_movelh_ps</td>
+<td>__lsx_vilvl_d</td>
+</tr>
+<tr>
+<td>_mm_movemask_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_movemask_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_movemask_ps</td>
+<td>__lsx_vmskltz_w + __lsx_vpickve2gr_wu</td>
+</tr>
+<tr>
+<td>_mm_movepi64_pi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_movpi64_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mpsadbw_epu8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mul_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mul_epu32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mul_pd</td>
+<td>__lsx_vfmul_d</td>
+</tr>
+<tr>
+<td>_mm_mul_ps</td>
+<td>__lsx_vfmul_s</td>
+</tr>
+<tr>
+<td>_mm_mul_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mul_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mulhi_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mulhi_epu16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mulhrs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mullo_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_mullo_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_or_pd</td>
+<td>__lsx_vor_v</td>
+</tr>
+<tr>
+<td>_mm_or_ps</td>
+<td>__lsx_vor_v</td>
+</tr>
+<tr>
+<td>_mm_or_si128</td>
+<td>__lsx_vor_v</td>
+</tr>
+<tr>
+<td>_mm_packs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_packs_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_packus_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_packus_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_rcp_ps</td>
+<td>__lsx_vfrecip_s</td>
+</tr>
+<tr>
+<td>_mm_rcp_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_round_pd</td>
+<td>__lsx_vfrintr*_d</td>
+</tr>
+<tr>
+<td>_mm_round_ps</td>
+<td>__lsx_vfrintr*_s</td>
+</tr>
+<tr>
+<td>_mm_round_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_round_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_rsqrt_ps</td>
+<td>__lsx_vfrsqrt_s</td>
+</tr>
+<tr>
+<td>_mm_rsqrt_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sad_epu8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_epi64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_pd1</td>
+<td>__lsx_vldrepl_d/__lsx_vreplgr2vr_d</td>
+</tr>
+<tr>
+<td>_mm_set_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_ps1</td>
+<td>__lsx_vldrepl_w/__lsx_vreplgr2vr_w</td>
+</tr>
+<tr>
+<td>_mm_set_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set1_epi16</td>
+<td>__lsx_vreplgr2vr_h</td>
+</tr>
+<tr>
+<td>_mm_set1_epi32</td>
+<td>__lsx_vreplgr2vr_w</td>
+</tr>
+<tr>
+<td>_mm_set1_epi64</td>
+<td>__lsx_vreplgr2vr_d</td>
+</tr>
+<tr>
+<td>_mm_set1_epi64x</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set1_epi8</td>
+<td>__lsx_vreplgr2vr_b</td>
+</tr>
+<tr>
+<td>_mm_set1_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_set1_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_setr_epi16</td>
+<td>use lsxintrin.h--v8i16 to reverse construction</td>
+</tr>
+<tr>
+<td>_mm_setr_epi32</td>
+<td>use lsxintrin.h--v4i32 to reverse construction</td>
+</tr>
+<tr>
+<td>_mm_setr_epi64</td>
+<td>use lsxintrin.h--v2i64 to reverse construction</td>
+</tr>
+<tr>
+<td>_mm_setr_epi8</td>
+<td>use lsxintrin.h--v16i8 to reverse construction</td>
+</tr>
+<tr>
+<td>_mm_setr_pd</td>
+<td>use lsxintrin.h--v2f64 to reverse construction</td>
+</tr>
+<tr>
+<td>_mm_setr_ps</td>
+<td>use lsxintrin.h--v4f32 to reverse construction</td>
+</tr>
+<tr>
+<td>_mm_setzero_pd</td>
+<td>(__m128d)__lsx_vldi(0)</td>
+</tr>
+<tr>
+<td>_mm_setzero_ps</td>
+<td>(__m128)__lsx_vldi(0)</td>
+</tr>
+<tr>
+<td>_mm_setzero_si128</td>
+<td>__lsx_vldi(0)</td>
+</tr>
+<tr>
+<td>_mm_shuffle_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_shuffle_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_shuffle_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_shuffle_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_shufflehi_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_shufflelo_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sign_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sign_epi32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sign_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sll_epi16</td>
+<td>__lsx_vsll_h</td>
+</tr>
+<tr>
+<td>_mm_sll_epi32</td>
+<td>__lsx_vsll_w</td>
+</tr>
+<tr>
+<td>_mm_sll_epi64</td>
+<td>__lsx_vsll_d</td>
+</tr>
+<tr>
+<td>_mm_slli_epi16</td>
+<td>__lsx_vslli_h</td>
+</tr>
+<tr>
+<td>_mm_slli_epi32</td>
+<td>__lsx_vslli_w</td>
+</tr>
+<tr>
+<td>_mm_slli_epi64</td>
+<td>__lsx_vslli_d</td>
+</tr>
+<tr>
+<td>_mm_slli_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sqrt_pd</td>
+<td>__lsx_vfsqrt_d</td>
+</tr>
+<tr>
+<td>_mm_sqrt_ps</td>
+<td>__lsx_vfsqrt_s</td>
+</tr>
+<tr>
+<td>_mm_sqrt_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sqrt_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sra_epi16</td>
+<td>__lsx_vsra_h</td>
+</tr>
+<tr>
+<td>_mm_sra_epi32</td>
+<td>__lsx_vsra_w</td>
+</tr>
+<tr>
+<td>_mm_srai_epi16</td>
+<td>__lsx_vsrai_h</td>
+</tr>
+<tr>
+<td>_mm_srai_epi32</td>
+<td>__lsx_vsrai_w</td>
+</tr>
+<tr>
+<td>_mm_srl_epi16</td>
+<td>__lsx_vsrl_h</td>
+</tr>
+<tr>
+<td>_mm_srl_epi32</td>
+<td>__lsx_vsrl_w</td>
+</tr>
+<tr>
+<td>_mm_srl_epi64</td>
+<td>__lsx_vsrl_d</td>
+</tr>
+<tr>
+<td>_mm_srli_epi16</td>
+<td>__lsx_vsrli_h</td>
+</tr>
+<tr>
+<td>_mm_srli_epi32</td>
+<td>__lsx_vsrli_w</td>
+</tr>
+<tr>
+<td>_mm_srli_epi64</td>
+<td>__lsx_vsrli_d</td>
+</tr>
+<tr>
+<td>_mm_srli_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_store_pd</td>
+<td>__lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_store_pd1</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_store_ps</td>
+<td>__lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_store_ps1</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_store_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_store_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_store_ss</td>
+<td>__lsx_vstelm_w</td>
+</tr>
+<tr>
+<td>_mm_store1_pd</td>
+<td>__lsx_vreplvei_d + __lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_store1_ps</td>
+<td>__lsx_vreplvei_w + __lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_storeh_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storeh_pi</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storel_epi64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storel_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storel_pi</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storer_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storer_ps</td>
+<td>__lsx_vshuf4i_w + __lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_storeu_pd</td>
+<td>__lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_storeu_ps</td>
+<td>__lsx_vst</td>
+</tr>
+<tr>
+<td>_mm_storeu_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storeu_si16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storeu_si32</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_storeu_si64</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_stream_load_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_stream_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_stream_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_stream_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sub_epi16</td>
+<td>__lsx_vsub_h</td>
+</tr>
+<tr>
+<td>_mm_sub_epi32</td>
+<td>__lsx_vsub_w</td>
+</tr>
+<tr>
+<td>_mm_sub_epi64</td>
+<td>__lsx_vsub_d</td>
+</tr>
+<tr>
+<td>_mm_sub_epi8</td>
+<td>__lsx_vsub_b</td>
+</tr>
+<tr>
+<td>_mm_sub_pd</td>
+<td>__lsx_vfsub_d</td>
+</tr>
+<tr>
+<td>_mm_sub_ps</td>
+<td>__lsx_vfsub_s</td>
+</tr>
+<tr>
+<td>_mm_sub_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_sub_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_subs_epi16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_subs_epi8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_subs_epu16</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_subs_epu8</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_test_all_ones</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_test_all_zeros</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_test_mix_ones_zeros</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testc_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testnzc_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_testz_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_MM_TRANSPOSE4_PS</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomieq_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomieq_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomige_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomige_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomigt_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomigt_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomile_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomile_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomilt_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomilt_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomineq_sd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_ucomineq_ss</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_undefined_pd</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_undefined_ps</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_undefined_si128</td>
+<td></td>
+</tr>
+<tr>
+<td>_mm_unpackhi_epi16</td>
+<td>__lsx_vilvh_h</td>
+</tr>
+<tr>
+<td>_mm_unpackhi_epi32</td>
+<td>__lsx_vilvh_w</td>
+</tr>
+<tr>
+<td>_mm_unpackhi_epi64</td>
+<td>__lsx_vilvh_d</td>
+</tr>
+<tr>
+<td>_mm_unpackhi_epi8</td>
+<td>__lsx_vilvh_b</td>
+</tr>
+<tr>
+<td>_mm_unpackhi_pd</td>
+<td>__lsx_vilvh_d</td>
+</tr>
+<tr>
+<td>_mm_unpackhi_ps</td>
+<td>__lsx_vilvh_w</td>
+</tr>
+<tr>
+<td>_mm_unpacklo_epi16</td>
+<td>__lsx_vilvl_h</td>
+</tr>
+<tr>
+<td>_mm_unpacklo_epi32</td>
+<td>__lsx_vilvl_w</td>
+</tr>
+<tr>
+<td>_mm_unpacklo_epi64</td>
+<td>__lsx_vilvl_d</td>
+</tr>
+<tr>
+<td>_mm_unpacklo_epi8</td>
+<td>__lsx_vilvl_b</td>
+</tr>
+<tr>
+<td>_mm_unpacklo_pd</td>
+<td>__lsx_vilvl_d</td>
+</tr>
+<tr>
+<td>_mm_unpacklo_ps</td>
+<td>__lsx_vilvl_w</td>
+</tr>
+<tr>
+<td>_mm_xor_pd</td>
+<td>__lsx_vxor_v</td>
+</tr>
+<tr>
+<td>_mm_xor_ps</td>
+<td>__lsx_vxor_v</td>
+</tr>
+<tr>
+<td>_mm_xor_si128</td>
+<td>__lsx_vxor_v</td>
+</tr>
+</tbody>
+</table>
+<p>The list of SSE intrinsics came from <a href="https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.htm">Intel Intrinsics Guide</a>.</p>
+              
+            </div>
+          </div><footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../migrating_avx/" class="btn btn-neutral float-left" title="Migrating from AVX to LASX"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../viewer/" class="btn btn-neutral float-right" title="Browse All Intrinsics">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../migrating_avx/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../viewer/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <script src="../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "..";</script>
+    <script src="../js/theme_extra.js"></script>
+    <script src="../js/theme.js"></script>
+    <script>
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>
diff --git a/sitemap.xml b/sitemap.xml
new file mode 100644
index 00000000..e226d815
--- /dev/null
+++ b/sitemap.xml
@@ -0,0 +1,178 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/latency_throughput/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/migrating_avx/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/migrating_sse/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/viewer/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/bitwise_operations/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/branch/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_comparison/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_computation/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_conversion/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/float_misc/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/fma/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_comparison/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/integer_computation/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/logical/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/memory/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/misc/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/permutation/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shift/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lasx/shuffling/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/bitwise_operations/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/branch/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_comparison/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_computation/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_conversion/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/float_misc/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/fma/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_comparison/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/integer_computation/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/logical/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/memory/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/misc/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/permutation/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shift/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+    <url>
+         <loc>https://jia.je/unofficial-loongarch-intrinsics-guide/lsx/shuffling/</loc>
+         <lastmod>2024-07-17</lastmod>
+         <changefreq>daily</changefreq>
+    </url>
+</urlset>
\ No newline at end of file
diff --git a/sitemap.xml.gz b/sitemap.xml.gz
new file mode 100644
index 00000000..21f13a89
Binary files /dev/null and b/sitemap.xml.gz differ
diff --git a/viewer/index.html b/viewer/index.html
new file mode 100644
index 00000000..a05393ea
--- /dev/null
+++ b/viewer/index.html
@@ -0,0 +1,341 @@
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+    <meta charset="utf-8" />
+    <meta http-equiv="X-UA-Compatible" content="IE=edge" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><link rel="canonical" href="https://jia.je/unofficial-loongarch-intrinsics-guide/viewer/" />
+      <link rel="shortcut icon" href="../img/favicon.ico" />
+    <title>Browse All Intrinsics - Unofficial LoongArch Intrinsics Guide</title>
+    <link rel="stylesheet" href="../css/theme.css" />
+    <link rel="stylesheet" href="../css/theme_extra.css" />
+        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/styles/github.min.css" />
+        <link href="../main.css" rel="stylesheet" />
+    
+      <script>
+        // Current page data
+        var mkdocs_page_name = "Browse All Intrinsics";
+        var mkdocs_page_input_path = "viewer.md";
+        var mkdocs_page_url = "/unofficial-loongarch-intrinsics-guide/viewer/";
+      </script>
+    
+    <!--[if lt IE 9]>
+      <script src="../js/html5shiv.min.js"></script>
+    <![endif]-->
+      <script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.8.0/highlight.min.js"></script>
+      <script>hljs.highlightAll();</script> 
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+    <div class="wy-side-scroll">
+      <div class="wy-side-nav-search">
+          <a href=".." class="icon icon-home"> Unofficial LoongArch Intrinsics Guide
+        </a>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="..">Unofficial LoongArch Intrinsics Guide</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../latency_throughput/">Latency and Throughput of Instructions</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../migrating_avx/">Migrating from AVX to LASX</a>
+                </li>
+              </ul>
+              <ul>
+                <li class="toctree-l1"><a class="reference internal" href="../migrating_sse/">Migrating from SSE to LSX</a>
+                </li>
+              </ul>
+              <ul class="current">
+                <li class="toctree-l1 current"><a class="reference internal current" href="./">Browse All Intrinsics</a>
+    <ul class="current">
+    </ul>
+                </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lasx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lasx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+              <p class="caption"><span class="caption-text">Lsx</span></p>
+              <ul>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/bitwise_operations/">Bitwise Operations</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/branch/">Branch</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_comparison/">Floating Point Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_computation/">Floating Point Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_conversion/">Floating Point Conversion</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/float_misc/">Floating Point Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/fma/">Fused Multiply-Add</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_comparison/">Integer Comparison</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/integer_computation/">Integer Computation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/logical/">Logical</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/memory/">Memory Load &amp; Store</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/misc/">Misc</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/permutation/">Permutation</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shift/">Shift</a>
+                  </li>
+                  <li class="toctree-l1"><a class="reference internal" href="../lsx/shuffling/">Shuffling</a>
+                  </li>
+              </ul>
+      </div>
+    </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+      <nav class="wy-nav-top" role="navigation" aria-label="Mobile navigation menu">
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="..">Unofficial LoongArch Intrinsics Guide</a>
+        
+      </nav>
+      <div class="wy-nav-content">
+        <div class="rst-content"><div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href=".." class="icon icon-home" aria-label="Docs"></a></li>
+      <li class="breadcrumb-item active">Browse All Intrinsics</li>
+    <li class="wy-breadcrumbs-aside">
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+            <div class="section" itemprop="articleBody">
+              
+                <h1 id="browse-all-intrinsics">Browse All Intrinsics</h1>
+<div id="app">
+<!-- In-DOM Vue template, presumably mounted by the module script that follows. The next block is a pre-mount placeholder: v-if="false" drops it as soon as Vue compiles this template. -->
+<div v-if="false">
+  Loading... Please wait...
+  <p></p>
+</div>
+<!-- Vue removes v-cloak after mounting; hiding this block before then relies on a [v-cloak] CSS rule (assumed to be in main.css - TODO confirm). -->
+<div v-cloak>
+
+<b style="padding-top: 10px">Categories:</b>
+<p></p>
+
+<div v-for="group in allGroups" :key="group">
+  <input type="checkbox" :id="group" :value="group" v-model="groups" />
+  <label :for="group" style="display: inline">{{ group }}</label>
+</div>
+
+<p></p>
+<b style="padding-top: 10px">Instruction Set Extensions:</b>
+<p></p>
+
+<div v-for="extension in allExtensions" :key="extension">
+  <input type="checkbox" :id="extension" :value="extension" v-model="extensions" />
+  <label :for="extension" style="display: inline">{{ extension }}</label>
+</div>
+
+<p></p>
+<b id="intrinsic-search-label">Filter by content:</b>
+<p></p>
+<!-- A placeholder is only a hint and vanishes on input; aria-labelledby takes the accessible name from the visible caption above. -->
+<input v-model="search" aria-labelledby="intrinsic-search-label" placeholder="Search intrinsics using MiniSearch search syntax" style="width: 100%">
+
+<p></p>
+<p></p>
+
+Found {{intrinsics.length}} intrinsics.
+
+<p></p>
+<p></p>
+<p></p>
+<details v-for="intrinsic in intrinsics.inner" v-show="intrinsic.display" style="margin-top: 5px;padding-left: 5px;border-top: 1px solid black;border-left: 5px solid black">
+  <summary>{{ intrinsic.name }}</summary>
+  <div v-html="intrinsic.content" style="padding: 10px"></div>
+</details>
+
+<p></p>
+<p></p>
+
+</div>
+</div>
+
+<script type="module">
+  import { createApp, ref, computed, nextTick } from 'https://unpkg.com/vue@3/dist/vue.esm-browser.js';
+  import MiniSearch from 'https://cdn.jsdelivr.net/npm/minisearch@6.3.0/dist/es/index.js';
+
+  const allIntrinsics =  [{"name": "__m128 __lsx_vfadd_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfadd_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfadd.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd single precision floating point elements in `a` to elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp32[i] + b.fp32[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfadd_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfadd.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add single precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp32[i] + b.fp32[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcvt.s.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double precision floating point elements in `a` and `b` to single precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    dst.fp32[i] = b.fp64[i];\n  } else {\n    dst.fp32[i] = 
a.fp64[i - 2];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfcvt_s_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcvt.s.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double precision floating point elements in <code>a</code> and <code>b</code> to single precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    dst.fp32[i] = b.fp64[i];\n  } else {\n    dst.fp32[i] = a.fp64[i - 2];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfcvth_s_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfcvth_s_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vfcvth.s.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert half precision floating point elements in higher half of `a` to single precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp16[4 + i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfcvth_s_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcvth.s.h vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert half precision floating point elements in higher half of <code>a</code> to single precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp16[4 + i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfcvtl_s_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfcvtl_s_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vfcvtl.s.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert half precision floating point elements in lower half of `a` to single precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp16[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfcvtl_s_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcvtl.s.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert half precision floating point elements in lower half of <code>a</code> to single precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp16[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfdiv_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfdiv_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfdiv.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide single precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp32[i] / b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 11, 19.5 | 0.13(1/7.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfdiv_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfdiv.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide single precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp32[i] / b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 19.5</td>\n<td>0.13(1/7.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vffint_s_l (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vffint_s_l (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vffint.s.l vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert 
64-bit integer elements in `a` and `b` to single-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] =\n      (i < 2) ? (f32)(s32)a.dword[i]\n              : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vffint_s_l (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffint.s.l vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert 64-bit integer elements in <code>a</code> and <code>b</code> to single-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] =\n      (i &lt; 2) ? (f32)(s32)a.dword[i]\n              : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vffint_s_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vffint_s_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vffint.s.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert signed 32-bit integer elements in `a` to single-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vffint_s_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffint.s.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert signed 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vffint_s_wu (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vffint_s_wu (__m128i a)\n#include <lsxintrin.h>\nInstruction: vffint.s.wu vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert unsigned 32-bit integer elements in `a` to single-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vffint_s_wu (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffint.s.wu vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert unsigned 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vflogb_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vflogb_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vflogb.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute 2-based logarithm of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = log2(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vflogb_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vflogb.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute 2-based logarithm of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = log2(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)", 
"markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)\n#include <lsxintrin.h>\nInstruction: vfmadd.s vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmadd_s (__m128 a, __m128 b, __m128 c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmadd.s vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmax_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmax_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfmax.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute maximum of single precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor 
(int i = 0; i < 4; i++) {\n  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmax_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmax.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfmaxa.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute maximum of single precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? 
a.fp32[i] : b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmaxa_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmaxa.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) &gt; abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmin_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmin_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfmin.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute minimum of single precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmin_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmin.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of single precision floating point 
elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmina_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmina_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfmina.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute minimum of single precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmina_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmina.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) &lt; abs(b.fp32[i])) ? 
a.fp32[i] : b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)\n#include <lsxintrin.h>\nInstruction: vfmsub.s vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmsub_s (__m128 a, __m128 b, __m128 c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmsub.s vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfmul_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfmul_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfmul.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply single precision floating point elements in `a` and elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfmul_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmul.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply single precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)\n#include <lsxintrin.h>\nInstruction: vfnmadd.s vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed single precision floating point 
FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfnmadd_s (__m128 a, __m128 b, __m128 c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfnmadd.s vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)\n#include <lsxintrin.h>\nInstruction: vfnmsub.s vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - 
c.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfnmsub_s (__m128 a, __m128 b, __m128 c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfnmsub.s vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrecip_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrecip_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrecip.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute reciprocal of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = 1 / a.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 27 | 0.14(1/7) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrecip_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrecip.s vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = 1 / a.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>27</td>\n<td>0.14(1/7)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrecipe_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrecipe_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrecipe.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute estimated reciprocal of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = 1 / a.fp32[i]; // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrecipe_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrecipe.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = 1 / a.fp32[i]; // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrint_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrint_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrint.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, using current rounding mode specified in `fcsr`, 
and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrint_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrint.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, using current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrintrm_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrintrm_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrintrm.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards negative infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrintrm_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrm.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrintrne_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrintrne_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrintrne.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards nearest even, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrintrne_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrne.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards nearest even, and store as floating point 
numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrintrp_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrintrp_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrintrp.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards positive infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrintrp_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrp.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrintrz_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrintrz_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrintrz.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards zero, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrintrz_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrz.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrsqrt_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrsqrt_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrsqrt.s vr, vr\nCPU 
Flags: LSX\n```\n\n### Description\n\nCompute reciprocal of square root of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17 | 0.05(1/19) |\n| 3C5000 | 21 | 0.11(1/9) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrsqrt_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrsqrt.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of square root of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17</td>\n<td>0.05(1/19)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>21</td>\n<td>0.11(1/9)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfrsqrte_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfrsqrte_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfrsqrte.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute estimated reciprocal of square root of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfrsqrte_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrsqrte.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated 
reciprocal of square root of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfsqrt_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfsqrt_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfsqrt.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute square root of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = sqrt(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.08(1/12) |\n| 3C5000 | 27 | 0.17(1/6) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfsqrt_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfsqrt.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute square root of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = sqrt(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.08(1/12)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>27</td>\n<td>0.17(1/6)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128 __lsx_vfsub_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128 __lsx_vfsub_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfsub.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### 
Description\n\nSubtract single precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp32[i] = a.fp32[i] - b.fp32[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128 __lsx_vfsub_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfsub.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract single precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp32[i] = a.fp32[i] - b.fp32[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfadd_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfadd_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfadd.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd double precision floating point elements in `a` to elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp64[i] + b.fp64[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfadd_d 
(__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfadd.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add double precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp64[i] + b.fp64[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfcvth_d_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfcvth_d_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfcvth.d.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single precision floating point elements in higher half of `a` to double precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp32[2 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfcvth_d_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcvth.d.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single precision floating point elements in higher half of <code>a</code> to double precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp32[2 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfcvtl_d_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfcvtl_d_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfcvtl.d.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single precision floating point elements in lower half of `a` to double precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp32[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfcvtl_d_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcvtl.d.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single precision floating point elements in lower half of <code>a</code> to double precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp32[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfdiv_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfdiv_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfdiv.d vr, 
vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide double precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp64[i] / b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8, 21.5 | 0.25(1/4) |\n| 3C5000 | 8, 16.5 | 0.08(1/12.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfdiv_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfdiv.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide double precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp64[i] / b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8, 21.5</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 16.5</td>\n<td>0.08(1/12.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vffint_d_l (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vffint_d_l (__m128i a)\n#include <lsxintrin.h>\nInstruction: vffint.d.l vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert signed 64-bit integer elements in `a` to double-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vffint_d_l (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffint.d.l vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert signed 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vffint_d_lu (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vffint_d_lu (__m128i a)\n#include <lsxintrin.h>\nInstruction: vffint.d.lu vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert unsigned 64-bit integer elements in `a` to double-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vffint_d_lu (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffint.d.lu vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert unsigned 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (f64)(u64)a.dword[i]; // 
rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vffinth_d_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vffinth_d_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vffinth.d.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert 32-bit integer elements in higher part of `a` to double precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vffinth_d_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffinth.d.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert 32-bit integer elements in higher part of <code>a</code> to double precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i + 2]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vffintl_d_w (__m128i 
a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vffintl_d_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vffintl.d.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert 32-bit integer elements in lower part of `a` to double precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vffintl_d_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vffintl.d.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert 32-bit integer elements in lower part of <code>a</code> to double precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vflogb_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vflogb_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vflogb.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute 2-based logarithm of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = log2(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vflogb_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vflogb.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute 2-based logarithm of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = log2(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)\n#include <lsxintrin.h>\nInstruction: vfmadd.d vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmadd_d (__m128d a, __m128d b, __m128d c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmadd.d vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply 
elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmax_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfmax_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfmax.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute maximum of double precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmax_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmax.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfmaxa.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute maximum of double precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmaxa_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmaxa.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) &gt; abs(b.fp64[i])) ? 
a.fp64[i] : b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmin_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfmin_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfmin.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute minimum of double precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmin_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmin.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmina_d (__m128d a, __m128d 
b)\n#include <lsxintrin.h>\nInstruction: vfmina.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute minimum of double precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmina_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmina.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) &lt; abs(b.fp64[i])) ? 
a.fp64[i] : b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)\n#include <lsxintrin.h>\nInstruction: vfmsub.d vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmsub_d (__m128d a, __m128d b, __m128d c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmsub.d vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfmul_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfmul_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfmul.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply double precision floating point elements in `a` and elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfmul_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfmul.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply double precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)\n#include <lsxintrin.h>\nInstruction: vfnmadd.d vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed double 
precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfnmadd_d (__m128d a, __m128d b, __m128d c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfnmadd.d vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)\n#include <lsxintrin.h>\nInstruction: vfnmsub.d vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = -(a.fp64[i] * 
b.fp64[i] - c.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfnmsub_d (__m128d a, __m128d b, __m128d c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfnmsub.d vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrecip_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrecip_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrecip.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute reciprocal of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = 1 / a.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8 | 0.25(1/4) |\n| 3C5000 | 23 | 0.08(1/12) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrecip_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrecip.d vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = 1 / a.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>23</td>\n<td>0.08(1/12)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrecipe_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrecipe_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrecipe.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute estimated reciprocal of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = 1 / a.fp64[i]; // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrecipe_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrecipe.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = 1 / a.fp64[i]; // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrint_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrint_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrint.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, using current rounding mode specified in 
`fcsr`, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrint_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrint.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, using current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrintrm_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrintrm_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrintrm.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards negative infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", 
"content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrintrm_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrm.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrintrne_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrintrne_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrintrne.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards nearest even, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrintrne_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrne.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards nearest even, and store as floating point 
numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrintrp_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrintrp_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrintrp.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards positive infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrintrp_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrp.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrintrz_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrintrz_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrintrz.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards zero, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrintrz_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrintrz.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrsqrt_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrsqrt_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrsqrt.d 
vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute reciprocal of square root of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 15 | 0.04(1/26.5) |\n| 3C5000 | 15 | 0.04(1/27.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrsqrt_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrsqrt.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of square root of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>15</td>\n<td>0.04(1/26.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>15</td>\n<td>0.04(1/27.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfrsqrte_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfrsqrte_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfrsqrte.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute estimated reciprocal of square root of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfrsqrte_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrsqrte.d vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of square root of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfsqrt_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfsqrt_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfsqrt.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute square root of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = sqrt(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 36 | 0.06(1/17.5) |\n| 3C5000 | 36 | 0.05(1/18.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfsqrt_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfsqrt.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute square root of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = sqrt(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>36</td>\n<td>0.06(1/17.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>36</td>\n<td>0.05(1/18.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128d __lsx_vfsub_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128d __lsx_vfsub_d (__m128d a, __m128d 
b)\n#include <lsxintrin.h>\nInstruction: vfsub.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract double precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.fp64[i] = a.fp64[i] - b.fp64[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128d __lsx_vfsub_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfsub.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract double precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.fp64[i] = a.fp64[i] - b.fp64[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of signed 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? 
(a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &gt; (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of unsigned 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? 
(a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &gt; (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of signed 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])\n                     ? 
(a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &gt; (s64)b.dword[i])\n                     ? (a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of unsigned 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])\n                     ? 
(a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &gt; (u64)b.dword[i])\n                     ? (a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of signed 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? 
(a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &gt; (s16)b.half[i]) ? (a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of unsigned 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? 
(a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &gt; (u16)b.half[i]) ? (a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of signed 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? 
(a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &gt; (s32)b.word[i]) ? (a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vabsd_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vabsd_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vabsd.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute absolute difference of unsigned 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? 
(a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vabsd_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vabsd.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((u32)a.word[i] &gt; (u32)b.word[i]) ? (a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadd_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadd_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadd.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] + b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vadd_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadd.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] + b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadd_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadd_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadd.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] + b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadd_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadd.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] + b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadd_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadd_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadd.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] + b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadd_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadd.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] + b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadd_q (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadd_q (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadd.q vr, vr, 
vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 128-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst.qword[0] = a.qword[0] + b.qword[0];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadd_q (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadd.q vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.qword[0] = a.qword[0] + b.qword[0];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadd_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadd_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadd.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] + b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadd_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadd.w 
vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] + b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadda_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadda_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadda.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd absolute of 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadda_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadda.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadda_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadda_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadda.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd absolute of 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadda_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadda.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadda_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadda_h 
(__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadda.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd absolute of 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadda_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadda.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vadda_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vadda_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vadda.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd absolute of 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vadda_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vadda.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vaddi.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 8-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddi_bu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddi.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 8-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vaddi.du vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 64-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddi_du (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddi.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 64-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", 
"display": true}, {"name": "__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vaddi.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 16-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddi_hu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddi.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 16-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vaddi.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nAdd 32-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU 
| Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddi_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddi.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 32-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 32-bit elements in 
<code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_d_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.d.wu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_d_wu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.d.wu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer 
Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 8-bit 
elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_h_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.h.bu.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_h_bu_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.h.bu.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.q.d vr, vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_q_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  
dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.q.du.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_q_du_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.q.du.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)", "markdown": "### 
Synopsis\n\n```c++\n__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_w_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwev.w.hu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd even-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwev_w_hu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwev.w.hu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_d_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> 
and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.d.wu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_d_wu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.d.wu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 
1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", 
"group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_h_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.h.bu.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### 
Description\n\nAdd odd-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_h_bu_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.h.bu.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_q_du (__m128i a, __m128i b)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vaddwod.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.q.du.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_q_du_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.q.du.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_w_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", 
"display": true}, {"name": "__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vaddwod.w.hu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vaddwod_w_hu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vaddwod.w.hu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vand_v (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vand_v (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vand.v vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise AND between elements in `a` and 
`b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] & b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vand_v (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vand.v vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise AND between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &amp; b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vandi.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise AND between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] & imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vandi_b (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vandi.b vr, vr, imm\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise AND between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &amp; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vandn_v (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vandn_v (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vandn.v vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise ANDN between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = b.dword[i] & (~a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vandn_v (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vandn.v vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise ANDN between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = b.dword[i] &amp; (~a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +\n                ((a.byte[i] & b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] &amp; b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, 
{"name": "__m128i __lsx_vavg_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +\n                ((a.byte[i] & b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] &amp; b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### 
Description\n\nCompute the average (rounded towards negative infinity) of signed 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +\n                 ((a.dword[i] & b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  
dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +\n                 ((a.dword[i] & b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +\n                ((a.half[i] & b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput 
(CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] &amp; b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +\n                ((a.half[i] & b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_hu (__m128i a, __m128i 
b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] &amp; b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +\n                ((a.word[i] & b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 32-bit elements in 
<code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] &amp; b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavg_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavg_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavg.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +\n                ((a.word[i] & b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavg_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavg.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = 
((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] &amp; b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +\n                ((a.byte[i] | b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] | b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +\n                ((a.byte[i] | b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] | b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +\n                 ((a.dword[i] | b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] | b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": 
"LSX", "display": true}, {"name": "__m128i __lsx_vavgr_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +\n                 ((a.dword[i] | b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] | b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: 
vavgr.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +\n                ((a.half[i] | b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] | b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +\n                ((a.half[i] | b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] | b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +\n                ((a.word[i] | b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and 
Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vavgr_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] | b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vavgr_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vavgr_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vavgr.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +\n                ((a.word[i] | b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vavgr_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vavgr.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] | b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclr_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclr_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitclr.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclr_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_b (__m128i a, __m128i b)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vbitclr.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; (b.byte[i] % 8)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclr_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclr_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitclr.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclr_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xfffff7ffffffffff 0x99aabbccddeeff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vbitclr_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclr.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xfffff7ffffffffff 0x99aabbccddeeff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; (b.dword[i] % 64)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclr_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclr_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitclr.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclr_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xf7fff7fff7fff7ff 0x99aabbccddecff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 
3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclr.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xf7fff7fff7fff7ff 0x99aabbccddecff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; (b.half[i] % 16)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclr_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclr_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitclr.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclr_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xfffff7fffffff7ff 0x99aabbccddeeff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclr.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclr_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0xfffff7fffffff7ff 0x99aabbccddeeff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; (b.word[i] % 32)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vbitclri.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by `imm` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclri_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU 
| Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclri.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_b(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vbitclri.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by `imm` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclri_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfffffffffffffffd 0x99aabbccddeeff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclri.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_d(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfffffffffffffffd 0x99aabbccddeeff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vbitclri.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by `imm` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclri_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfffdfffdfffdfffd 0x99a8bbccddecff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] & (~((u16)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclri.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_h(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfffdfffdfffdfffd 0x99a8bbccddecff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vbitclri.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClear the bit specified by `imm` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitclri_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfffffffdfffffffd 0x99aabbccddeeff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] & (~((u32)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", 
"content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitclri.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitclri_w(__m128i{0xffffffffffffffff, 0x99aabbccddeeff00}, 1)\n= 0xfffffffdfffffffd 0x99aabbccddeeff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrev_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrev_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitrev.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrev_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0707070707070707 0x9dbabfdcd5ecf702\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 
| 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrev.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0707070707070707 0x9dbabfdcd5ecf702\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; (b.byte[i] % 8));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrev_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrev_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitrev.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrev_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrev.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; (b.dword[i] % 64));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrev_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrev_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitrev.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrev_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x070f070f070f070f 0x99babbdcddecff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));\n}\n```\n\nTested on real 
machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrev.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x070f070f070f070f 0x99babbdcddecff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; (b.half[i] % 16));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrev_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrev_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitrev.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrev_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0f0f070f0f0f070f 0x99babbccddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = 
a.word[i] ^ ((u32)1 << (b.word[i] % 32));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrev.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrev_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0f0f070f0f0f070f 0x99babbccddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; (b.word[i] % 32));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vbitrevi.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrevi_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) 
{\n  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrevi.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_b(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vbitrevi.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrevi_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 << 
imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrevi.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_d(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vbitrevi.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrevi_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrevi.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_h(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vbitrevi.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitrevi_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput 
(CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitrevi.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitrevi_w(__m128i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00}, 1)\n= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vbitsel.v vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise selection: for each bit position, if the bit in `c` equals one, copy the bit from `b` to `dst`, otherwise copy from `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitsel_v(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, __m128i{0xffff0000aaaabbbb, 0x1111222233334444})\n= 0xabab3344ffeeefab 0x98ba9beccfedfb00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (c.dword[i] & 
b.dword[i]) | (~c.dword[i] & a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitsel_v (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitsel.v vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise selection: for each bit position, if the bit in <code>c</code> equals one, copy the bit from <code>b</code> to <code>dst</code>, otherwise copy from <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitsel_v(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, __m128i{0xffff0000aaaabbbb, 0x1111222233334444})\n= 0xabab3344ffeeefab 0x98ba9beccfedfb00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (c.dword[i] &amp; b.dword[i]) | (~c.dword[i] &amp; a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vbitseli.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise selection: for each bit position, if the bit in `a` equals one, copy the bit from `imm` to `dst`, otherwise copy from 
`b`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitseli_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, 0x12)\n= 0xba8b9aabba8b9a23 0x1216123012031221\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseli_b (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitseli.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise selection: for each bit position, if the bit in <code>a</code> equals one, copy the bit from <code>imm</code> to <code>dst</code>, otherwise copy from <code>b</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseli_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321}, 0x12)\n= 0xba8b9aabba8b9a23 0x1216123012031221\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (~a.byte[i] &amp; b.byte[i]) | (a.byte[i] &amp; (u8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitset_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitset_b (__m128i a, __m128i b)\n#include 
<lsxintrin.h>\nInstruction: vbitset.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitset_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0808080808080808 0x9dbabfdcddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitset.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0808080808080808 0x9dbabfdcddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; (b.byte[i] % 8));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitset_d (__m128i a, __m128i b)", "markdown": "### 
Synopsis\n\n```c++\n__m128i __lsx_vbitset_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitset.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitset_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0000080000000000 0x99aabbceddeeff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitset.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0000080000000000 0x99aabbceddeeff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; (b.dword[i] % 64));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": 
"__m128i __lsx_vbitset_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitset_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitset.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitset_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0800080008000800 0x99babbdcddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitset.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0800080008000800 0x99babbdcddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; (b.half[i] % 16));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise 
Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitset_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitset_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vbitset.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitset_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0000080000000800 0x99babbccddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitset.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitset_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, __m128i{0xabababababababab, 0x1234123443214321})\n= 0x0000080000000800 0x99babbccddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; (b.word[i] % 32));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vbitseti.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by `imm` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitseti_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0202020202020202 0x9baabbcedfeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitseti.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_b(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0202020202020202 0x9baabbcedfeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vbitseti.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by `imm` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitseti_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0000000000000002 0x99aabbccddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitseti.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_d(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0000000000000002 0x99aabbccddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vbitseti.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by `imm` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitseti_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0002000200020002 0x99aabbceddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitseti.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_h(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0002000200020002 0x99aabbceddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vbitseti.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSet the bit specified by `imm` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vbitseti_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0000000200000002 0x99aabbceddeeff02\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbitseti.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbitseti_w(__m128i{0x0000000000000000, 0x99aabbccddeeff00}, 1)\n= 0x0000000200000002 0x99aabbceddeeff02\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vbsll.v vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute whole vector `a` shifted left by `imm * 8` bits.\n\n\n\n\n\n### Operation\n\n```c++\nint shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] << shift;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbsll_v (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbsll.v vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute whole vector <code>a</code> shifted left by <code>imm * 8</code> bits.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] &lt;&lt; shift;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vbsrl.v vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute whole vector `a` shifted right by `imm * 8` 
bits.\n\n\n\n\n\n### Operation\n\n```c++\nint shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] >> shift;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vbsrl_v (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vbsrl.v vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute whole vector <code>a</code> shifted right by <code>imm * 8</code> bits.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] &gt;&gt; shift;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclo_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclo_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclo.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading ones of 8-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclo_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000001 0x0101010202030800\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = clo(a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_b (__m128i a)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vclo.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 8-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000001 0x0101010202030800\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = clo(a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclo_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclo_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclo.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading ones of 64-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclo_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000000 0x0000000000000001\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = clo(a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclo.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 64-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vclo_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000000 0x0000000000000001\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = clo(a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclo_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclo_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclo.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading ones of 16-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclo_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000000 0x0001000100020008\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = clo(a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclo.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 16-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000000 0x0001000100020008\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = 
clo(a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclo_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclo_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclo.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading ones of 32-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclo_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000000 0x0000000100000002\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = clo(a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclo.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 32-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclo_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000000 0x0000000100000002\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = clo(a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclz_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclz_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclz.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading zeros of 8-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0302020101010100 0x0000000000000008\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = clz(a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclz.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 8-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0302020101010100 0x0000000000000008\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = clz(a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclz_d (__m128i 
a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclz_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclz.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading zeros of 64-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000003 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = clz(a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclz.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 64-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000003 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = clz(a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclz_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclz_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclz.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading zeros of 16-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i 
__lsx_vclz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0003000200010001 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = clz(a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclz.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 16-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0003000200010001 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = clz(a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vclz_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vclz_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vclz.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount leading zeros of 32-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vclz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000300000001 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = clz(a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU 
| Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vclz.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 32-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vclz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000300000001 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = clz(a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide signed 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((s8)a.byte[i] / (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 32 | 0.06(1/15.5) |\n| 3C5000 | 29, 32 | 0.06(1/17) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 32</td>\n<td>0.06(1/15.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 32</td>\n<td>0.06(1/17)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide unsigned 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((u8)a.byte[i] / (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 33 | 0.06(1/16.5) |\n| 3C5000 | 29, 36 | 0.06(1/18) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 33</td>\n<td>0.06(1/16.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 36</td>\n<td>0.06(1/18)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide signed 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((s64)a.dword[i] / (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8 | 0.25(1/4) |\n| 3C5000 | 8, 18.5 | 0.11(1/9) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 18.5</td>\n<td>0.11(1/9)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide unsigned 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((u64)a.dword[i] / (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8 | 0.25(1/4) |\n| 3C5000 | 8, 18.5 | 0.11(1/9) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 18.5</td>\n<td>0.11(1/9)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide signed 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((s16)a.half[i] / (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17 | 0.12(1/8.5) |\n| 3C5000 | 17, 21.5 | 0.09(1/11) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17</td>\n<td>0.12(1/8.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 21.5</td>\n<td>0.09(1/11)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide unsigned 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((u16)a.half[i] / (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17, 22 | 0.11(1/9) |\n| 3C5000 | 17, 21.5 | 0.07(1/14) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17, 22</td>\n<td>0.11(1/9)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 21.5</td>\n<td>0.07(1/14)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide signed 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((s32)a.word[i] / (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 11, 17.5 | 0.09(1/11.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 17.5</td>\n<td>0.09(1/11.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vdiv_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vdiv_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vdiv.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nDivide unsigned 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((u32)a.word[i] / (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 11, 17.5 | 0.07(1/15) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vdiv_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vdiv.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 17.5</td>\n<td>0.07(1/15)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_d_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_d_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.d.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 32-bit elements in the higher half of `a` to 64-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_d_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.d.w vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 32-bit elements in the higher half of <code>a</code> to 64-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_du_wu (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_du_wu (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.du.wu vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 32-bit elements in the higher half of `a` to 64-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_du_wu (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.du.wu vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 32-bit elements in the higher half of <code>a</code> to 64-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_h_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_h_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.h.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 8-bit elements in the higher half of `a` to 16-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[8 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_h_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.h.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 8-bit elements in the higher half of <code>a</code> to 16-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[8 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_hu_bu (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_hu_bu (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.hu.bu vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 8-bit elements in the higher half of `a` to 16-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i 
< 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[8 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_hu_bu (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.hu.bu vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 8-bit elements in the higher half of <code>a</code> to 16-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[8 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_q_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_q_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.q.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 64-bit elements in the higher half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[1 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_q_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.q.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 64-bit elements in the higher half of <code>a</code> to 
128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[1 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_qu_du (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_qu_du (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.qu.du vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 64-bit elements in the higher half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[1 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_qu_du (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.qu.du vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 64-bit elements in the higher half of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[1 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": 
true}, {"name": "__m128i __lsx_vexth_w_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_w_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.w.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 16-bit elements in the higher half of `a` to 32-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[4 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_w_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.w.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 16-bit elements in the higher half of <code>a</code> to 32-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[4 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vexth_wu_hu (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vexth_wu_hu (__m128i a)\n#include <lsxintrin.h>\nInstruction: vexth.wu.hu vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 16-bit elements in the higher half of `a` to 32-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[4 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 
| 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vexth_wu_hu (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vexth.wu.hu vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 16-bit elements in the higher half of <code>a</code> to 32-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[4 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vextl_q_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vextl_q_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vextl.q.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 64-bit elements in the lower half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vextl_q_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vextl.q.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 64-bit elements in the lower half of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vextl_qu_du (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vextl_qu_du (__m128i a)\n#include <lsxintrin.h>\nInstruction: vextl.qu.du vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 64-bit elements in the lower half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vextl_qu_du (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vextl.qu.du vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 64-bit elements in the lower half of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: 
vextrins.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtract one 8-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vextrins_b (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vextrins.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 8-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i == ((imm &gt;&gt; 4) &amp; 15)) ? b.byte[imm &amp; 15] : a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vextrins.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtract one 64-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i == ((imm >> 4) & 1)) ? 
b.dword[imm & 1] : a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vextrins_d (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vextrins.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 64-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i == ((imm &gt;&gt; 4) &amp; 1)) ? b.dword[imm &amp; 1] : a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vextrins.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtract one 16-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i == ((imm >> 4) & 7)) ? 
b.half[imm & 7] : a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vextrins_h (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vextrins.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 16-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i == ((imm &gt;&gt; 4) &amp; 7)) ? b.half[imm &amp; 7] : a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vextrins.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtract one 32-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i == ((imm >> 4) & 3)) ? 
b.word[imm & 3] : a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vextrins_w (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vextrins.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 32-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i == ((imm &gt;&gt; 4) &amp; 3)) ? b.word[imm &amp; 3] : a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfclass_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfclass_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vfclass.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nClassify each double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = fp_classify(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfclass_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfclass.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Classify each double 
precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = fp_classify(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfclass_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfclass_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vfclass.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nClassify each single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = fp_classify(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfclass_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfclass.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Classify each single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = fp_classify(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_caf_d (__m128d 
a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.caf.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_caf_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.caf.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.caf.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_caf_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.caf.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.ceq.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_ceq_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.ceq.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.ceq.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_ceq_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.ceq.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cle.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cle_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cle.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cle.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cle_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cle.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.clt.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_clt_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.clt.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.clt.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_clt_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.clt.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cne.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cne_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cne.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cne.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cne_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cne.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cor.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cor_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cor.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cor.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cor_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cor.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cueq.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cueq_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cueq.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cueq.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cueq_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cueq.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cule.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cule_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cule.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cule.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cule_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cule.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cult.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cult_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cult.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cult.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cult_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cult.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cun.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cun_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cun.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cun.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cun_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cun.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cune.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cune_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cune.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.cune.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_cune_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.cune.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.saf.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_saf_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.saf.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.saf.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_saf_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.saf.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.seq.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_seq_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.seq.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.seq.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_seq_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.seq.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sle.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sle_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sle.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sle.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sle_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sle.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.slt.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_slt_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.slt.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.slt.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_slt_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.slt.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sne.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sne_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sne.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sne.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sne_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sne.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sor.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sor_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sor.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sor.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sor_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sor.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sueq.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sueq_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sueq.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sueq.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sueq_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sueq.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sule.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sule_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sule.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sule.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sule_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sule.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sult.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sult_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sult.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sult.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sult_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sult.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sun.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sun_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sun.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sun.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sun_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sun.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sune.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sune_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sune.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcmp.sune.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcmp_sune_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcmp.sune.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)\n#include <lsxintrin.h>\nInstruction: vfcvt.h.s vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single precision floating point elements in `a` and `b` to half precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    dst.fp16[i] = b.fp32[i];\n  } else {\n    dst.fp16[i] = a.fp32[i - 4];\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfcvt_h_s (__m128 a, __m128 b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfcvt.h.s vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single precision floating point elements in <code>a</code> and <code>b</code> to half precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    dst.fp16[i] = b.fp32[i];\n  } else {\n    dst.fp16[i] = a.fp32[i - 4];\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vfrstp.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFind the first negative 8-bit element in `b`, set the index of the element to the lane of `a` specified by `c`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i < 16; i++) {\n  if ((s8)b.byte[i] < 0) {\n    break;\n  }\n}\ndst.byte[c.byte[0] % 16] = i;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfrstp_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrstp.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 8-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>c</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i &lt; 16; i++) {\n  if ((s8)b.byte[i] &lt; 0) {\n    break;\n  }\n}\ndst.byte[c.byte[0] % 16] = i;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vfrstp.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFind the first negative 16-bit element in `b`, set the index of the element to the lane of `a` specified by `c`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i < 8; i++) {\n  if ((s16)b.half[i] < 0) {\n    break;\n  }\n}\ndst.half[c.half[0] % 8] = i;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfrstp_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrstp.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 16-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>c</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i &lt; 8; i++) {\n  if ((s16)b.half[i] &lt; 0) {\n    break;\n  }\n}\ndst.half[c.half[0] % 8] = i;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": 
"Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vfrstpi.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nFind the first negative 8-bit element in `b`, set the index of the element to the lane of `a` specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i < 16; i++) {\n  if ((s8)b.byte[i] < 0) {\n    break;\n  }\n}\ndst.byte[imm % 16] = i;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfrstpi_b (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrstpi.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 8-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i &lt; 16; i++) {\n  if ((s8)b.byte[i] &lt; 0) {\n    break;\n  }\n}\ndst.byte[imm % 16] = i;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i 
__lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vfrstpi.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nFind the first negative 16-bit element in `b`, set the index of the element to the lane of `a` specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i < 8; i++) {\n  if ((s16)b.half[i] < 0) {\n    break;\n  }\n}\ndst.half[imm % 8] = i;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vfrstpi_h (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vfrstpi.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 16-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i &lt; 8; i++) {\n  if ((s16)b.half[i] &lt; 0) {\n    break;\n  }\n}\ndst.half[imm % 8] = i;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftint_l_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftint_l_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftint.l.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 
64-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftint_l_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftint.l.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, using current rounding mode specified in <code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftint_lu_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftint_lu_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftint.lu.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to unsigned 64-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftint_lu_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftint.lu.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, using current rounding mode specified in <code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftint_w_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftint_w_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vftint.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftint_w_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftint.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, using current rounding mode specified in <code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftint_w_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftint_w_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftint.w.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftint_w_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftint.w.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, using current rounding mode specified in <code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftint_wu_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftint_wu_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftint.wu.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to unsigned 32-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftint_wu_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftint.wu.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, using current rounding mode specified in 
<code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftinth_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftinth_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftinth.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftinth_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftinth.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, using current rounding mode specified in <code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintl_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintl_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintl.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, using current rounding mode specified in `fscr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintl_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintl.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, using current rounding mode specified in <code>fscr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", 
"display": true}, {"name": "__m128i __lsx_vftintrm_l_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrm_l_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftintrm.l.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrm_l_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrm.l.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vftintrm.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrm_w_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrm.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrm_w_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrm_w_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrm.w.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrm_w_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrm.w.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrmh_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrmh_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrmh.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrmh_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrmh.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards 
negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrml_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrml_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrml.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrml_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrml.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrne_l_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrne_l_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftintrne.l.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrne_l_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrne.l.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)\n#include 
<lsxintrin.h>\nInstruction: vftintrne.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrne_w_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrne.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrne_w_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrne_w_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrne.w.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrne_w_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrne.w.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", 
"extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrneh_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrneh_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrneh.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrneh_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrneh.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrnel_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrnel_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrnel.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards nearest 
even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrnel_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrnel.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrp_l_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrp_l_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftintrp.l.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vftintrp_l_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrp.l.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vftintrp.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrp_w_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrp.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrp_w_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrp_w_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrp.w.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vftintrp_w_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrp.w.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrph_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrph_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrph.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrph_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrph.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for 
(int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrpl_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrpl_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrpl.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrpl_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrpl.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", 
"group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrz_l_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrz_l_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftintrz.l.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrz_l_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrz.l.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrz_lu_d (__m128d a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrz_lu_d (__m128d a)\n#include <lsxintrin.h>\nInstruction: vftintrz.lu.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to unsigned 64-bit integer, rounding towards zero.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrz_lu_d (__m128d a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrz.lu.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)\n#include <lsxintrin.h>\nInstruction: vftintrz.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrz_w_d (__m128d a, __m128d b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrz.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrz_w_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrz_w_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrz.w.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vftintrz_w_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrz.w.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrz_wu_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrz_wu_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrz.wu.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to unsigned 32-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrz_wu_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrz.wu.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = 
(u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vftintrzh_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrzh_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrzh.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrzh_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrzh.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 2]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": 
true}, {"name": "__m128i __lsx_vftintrzl_l_s (__m128 a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vftintrzl_l_s (__m128 a)\n#include <lsxintrin.h>\nInstruction: vftintrzl.l.s vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vftintrzl_l_s (__m128 a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vftintrzl.l.s vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 32-bit elements in `a` to even-positioned signed 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i 
< 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 32-bit elements in <code>a</code> to even-positioned signed 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.du.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 32-bit elements in `a` to even-positioned unsigned 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_du_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.du.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> to even-positioned unsigned 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 8-bit elements in `a` to even-positioned signed 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 8-bit elements in <code>a</code> to even-positioned signed 8-bit 
elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.hu.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 8-bit elements in `a` to even-positioned unsigned 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_hu_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.hu.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> to even-positioned unsigned 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 64-bit elements in `a` to even-positioned signed 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 64-bit elements in <code>a</code> to even-positioned signed 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", 
"display": true}, {"name": "__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.qu.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 64-bit elements in `a` to even-positioned unsigned 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_qu_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.qu.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> to even-positioned unsigned 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned signed 
16-bit elements in `a` to even-positioned signed 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 16-bit elements in <code>a</code> to even-positioned signed 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhaddw.wu.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nAdd odd-positioned unsigned 16-bit elements in `a` to even-positioned unsigned 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | 
Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhaddw_wu_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhaddw.wu.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> to even-positioned unsigned 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 32-bit elements in `a` by even-positioned signed 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.d.w vr, vr, 
vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 32-bit elements in <code>a</code> by even-positioned signed 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.du.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 32-bit elements in `a` by even-positioned unsigned 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_du_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.du.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 32-bit elements in <code>a</code> by even-positioned unsigned 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for 
(int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 8-bit elements in `a` by even-positioned signed 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 8-bit elements in <code>a</code> by even-positioned signed 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.hu.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 8-bit elements in `a` by even-positioned unsigned 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_hu_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.hu.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 8-bit elements in <code>a</code> by even-positioned unsigned 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)", 
"markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 64-bit elements in `a` by even-positioned signed 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 64-bit elements in <code>a</code> by even-positioned signed 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.qu.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 64-bit elements in `a` by even-positioned unsigned 64-bit elements in `b` to 
get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_qu_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.qu.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 64-bit elements in <code>a</code> by even-positioned unsigned 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 16-bit elements in `a` by even-positioned signed 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 16-bit elements in <code>a</code> by even-positioned signed 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vhsubw.wu.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 16-bit elements in `a` by even-positioned unsigned 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vhsubw_wu_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vhsubw.wu.hu vr, vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 16-bit elements in <code>a</code> by even-positioned unsigned 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvh_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvh_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvh.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 8-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvh_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvh.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 8-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i / 2 + 8] : b.byte[i / 2 + 8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvh_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvh_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvh.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 64-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvh_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvh.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 64-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i / 2 + 1] : b.dword[i / 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvh_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvh_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvh.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 16-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvh_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvh.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 16-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i / 2 + 4] : b.half[i / 2 + 4];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvh_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvh_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvh.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 32-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvh_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvh.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 32-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i / 2 + 2] : b.word[i / 2 + 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvl_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvl_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvl.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 8-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvl_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvl.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 8-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i / 2] : b.byte[i / 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvl_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvl_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvl.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 64-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvl_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvl.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 64-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i / 2] : b.dword[i / 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvl_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvl_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvl.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 16-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvl_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvl.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 16-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i / 2] : b.half[i / 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vilvl_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vilvl_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vilvl.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nInterleave 32-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vilvl_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vilvl.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 32-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i / 2] : b.word[i / 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vinsgr2vr.b vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nInsert 8-bit element into lane indexed `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i == imm) ? b : a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vinsgr2vr_b (__m128i a, int b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vinsgr2vr.b vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert 8-bit element into lane indexed <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i == imm) ? 
b : a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)\n#include <lsxintrin.h>\nInstruction: vinsgr2vr.d vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nInsert 64-bit element into lane indexed `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i == imm) ? b : a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vinsgr2vr_d (__m128i a, long int b, imm0_1 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vinsgr2vr.d vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert 64-bit element into lane indexed <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i == imm) ? 
b : a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vinsgr2vr.h vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nInsert 16-bit element into lane indexed `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i == imm) ? b : a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vinsgr2vr_h (__m128i a, int b, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vinsgr2vr.h vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert 16-bit element into lane indexed <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i == imm) ? 
b : a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)\n#include <lsxintrin.h>\nInstruction: vinsgr2vr.w vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nInsert 32-bit element into lane indexed `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i == imm) ? b : a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vinsgr2vr_w (__m128i a, int b, imm0_3 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vinsgr2vr.w vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert 32-bit element into lane indexed <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i == imm) ? 
b : a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)\n#include <lsxintrin.h>\nInstruction: vld vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRead whole vector from memory address `addr + offset`, save the data into `dst`. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.\n\n\n\n\n\n### Operation\n\n```c++\ndst = memory_load(128, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vld (void * addr, imm_n2048_2047 offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vld vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read whole vector from memory address <code>addr + offset</code>, save the data into <code>dst</code>. 
Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = memory_load(128, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vldi (imm_n1024_1023 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vldi (imm_n1024_1023 imm)\n#include <lsxintrin.h>\nInstruction: vldi vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\n\nInitialize `dst` using predefined patterns:\n\n- `imm[12:10]=0b000`: broadcast `imm[7:0]` as 8-bit elements to all lanes\n- `imm[12:10]=0b001`: broadcast sign-extended `imm[9:0]` as 16-bit elements to all lanes\n- `imm[12:10]=0b010`: broadcast sign-extended `imm[9:0]` as 32-bit elements to all lanes\n- `imm[12:10]=0b011`: broadcast sign-extended `imm[9:0]` as 64-bit elements to all lanes\n- `imm[12:8]=0b10000`: broadcast `imm[7:0]` as 32-bit elements to all lanes\n- `imm[12:8]=0b10001`: broadcast `imm[7:0] << 8` as 32-bit elements to all lanes\n- `imm[12:8]=0b10010`: broadcast `imm[7:0] << 16` as 32-bit elements to all lanes\n- `imm[12:8]=0b10011`: broadcast `imm[7:0] << 24` as 32-bit elements to all lanes\n- `imm[12:8]=0b10100`: broadcast `imm[7:0]` as 16-bit elements to all lanes\n- `imm[12:8]=0b10101`: broadcast `imm[7:0] << 8` as 16-bit elements to all lanes\n- `imm[12:8]=0b10110`: broadcast `(imm[7:0] << 8) | 0xFF` as 32-bit elements to all lanes\n- `imm[12:8]=0b10111`: broadcast `(imm[7:0] << 16) | 0xFFFF` as 32-bit elements to all lanes\n- `imm[12:8]=0b11000`: broadcast `imm[7:0]` as 8-bit elements to all lanes\n- `imm[12:8]=0b11001`: repeat each bit of `imm[7:0]` eight times, and broadcast the result as 64-bit elements to all lanes\n- `imm[12:8]=0b11010`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 32-bit elements to all lanes\n- `imm[12:8]=0b11011`: broadcast 
`(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 64-bit elements to all lanes\n- `imm[12:8]=0b11100`: broadcast `(imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)` as 64-bit elements to all lanes\n\n\n\n\n\n\n### Operation\n\n```c++\nu64 imm12_10 = (imm >> 10) & 0b111;\nu64 imm12_8 = (imm >> 8) & 0b11111;\nu64 imm9_0 = imm & 0x3FF;\ns64 simm9_0 = ((s64)imm9_0 << 54) >> 54;\nu64 imm7_0 = imm & 0xFF;\nu64 imm7 = (imm >> 7) & 0x1;\nu64 imm6 = (imm >> 6) & 0x1;\nu64 imm5 = (imm >> 5) & 0x1;\nu64 imm5_0 = imm & 0x3F;\nu64 imm4 = (imm >> 4) & 0x1;\nu64 imm3 = (imm >> 3) & 0x1;\nu64 imm2 = (imm >> 2) & 0x1;\nu64 imm1 = (imm >> 1) & 0x1;\nu64 imm0 = imm & 0x1;\n\nu64 broadcast_value;\nu64 broadcast_width;\nif (imm12_10 == 0b000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_10 == 0b001) {\n  broadcast_value = simm9_0;\n  broadcast_width = 16;\n} else if (imm12_10 == 0b010) {\n  broadcast_value = simm9_0;\n  broadcast_width = 32;\n} else if (imm12_10 == 0b011) {\n  broadcast_value = simm9_0;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b10000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10001) {\n  broadcast_value = imm7_0 << 8;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10010) {\n  broadcast_value = imm7_0 << 16;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10011) {\n  broadcast_value = imm7_0 << 24;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10100) {\n  broadcast_value = imm7_0;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10101) {\n  broadcast_value = imm7_0 << 8;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10110) {\n  broadcast_value = (imm7_0 << 8) | 0xFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10111) {\n  broadcast_value = (imm7_0 << 16) | 0xFFFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_8 == 0b11001) {\n  
broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +\n                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +\n                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +\n                    imm7 * 0xFF00000000000000;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11010) {\n  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |\n                    (imm5_0 << 19);\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11011) {\n  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |\n                    (imm5_0 << 19);\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11100) {\n  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |\n                    (imm5_0 << 48);\n  broadcast_width = 64;\n}\n\nif (broadcast_width == 8) {\n  for (int i = 0; i < 16; i++) {\n    dst.byte[i] = broadcast_value;\n  }\n} else if (broadcast_width == 16) {\n  for (int i = 0; i < 8; i++) {\n    dst.half[i] = broadcast_value;\n  }\n} else if (broadcast_width == 32) {\n  for (int i = 0; i < 4; i++) {\n    dst.word[i] = broadcast_value;\n  }\n} else if (broadcast_width == 64) {\n  for (int i = 0; i < 2; i++) {\n    dst.dword[i] = broadcast_value;\n  }\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vldi (imm_n1024_1023 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldi vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Initialize <code>dst</code> using predefined patterns:</p>\n<ul>\n<li><code>imm[12:10]=0b000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>\n<li><code>imm[12:10]=0b001</code>: broadcast sign-extended <code>imm[9:0]</code> as 16-bit elements to all lanes</li>\n<li><code>imm[12:10]=0b010</code>: broadcast sign-extended <code>imm[9:0]</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:10]=0b011</code>: broadcast sign-extended 
<code>imm[9:0]</code> as 64-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10000</code>: broadcast <code>imm[7:0]</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10001</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10010</code>: broadcast <code>imm[7:0] &lt;&lt; 16</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10011</code>: broadcast <code>imm[7:0] &lt;&lt; 24</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10100</code>: broadcast <code>imm[7:0]</code> as 16-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10101</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 16-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10110</code>: broadcast <code>(imm[7:0] &lt;&lt; 8) | 0xFF</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10111</code>: broadcast <code>(imm[7:0] &lt;&lt; 16) | 0xFFFF</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11001</code>: repeat each bit of <code>imm[7:0]</code> eight times, and broadcast the result as 64-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11010</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11011</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 64-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11100</code>: broadcast <code>(imm[7] &lt;&lt; 63) | ((1-imm[6]) &lt;&lt; 62) | ((imm[6] * 0xFF) &lt;&lt; 54) | (imm[5:0] &lt;&lt; 48)</code> as 64-bit elements to all lanes</li>\n</ul>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 imm12_10 = (imm &gt;&gt; 10) &amp; 0b111;\nu64 
imm12_8 = (imm &gt;&gt; 8) &amp; 0b11111;\nu64 imm9_0 = imm &amp; 0x3FF;\ns64 simm9_0 = ((s64)imm9_0 &lt;&lt; 54) &gt;&gt; 54;\nu64 imm7_0 = imm &amp; 0xFF;\nu64 imm7 = (imm &gt;&gt; 7) &amp; 0x1;\nu64 imm6 = (imm &gt;&gt; 6) &amp; 0x1;\nu64 imm5 = (imm &gt;&gt; 5) &amp; 0x1;\nu64 imm5_0 = imm &amp; 0x3F;\nu64 imm4 = (imm &gt;&gt; 4) &amp; 0x1;\nu64 imm3 = (imm &gt;&gt; 3) &amp; 0x1;\nu64 imm2 = (imm &gt;&gt; 2) &amp; 0x1;\nu64 imm1 = (imm &gt;&gt; 1) &amp; 0x1;\nu64 imm0 = imm &amp; 0x1;\n\nu64 broadcast_value;\nu64 broadcast_width;\nif (imm12_10 == 0b000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_10 == 0b001) {\n  broadcast_value = simm9_0;\n  broadcast_width = 16;\n} else if (imm12_10 == 0b010) {\n  broadcast_value = simm9_0;\n  broadcast_width = 32;\n} else if (imm12_10 == 0b011) {\n  broadcast_value = simm9_0;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b10000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10001) {\n  broadcast_value = imm7_0 &lt;&lt; 8;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10010) {\n  broadcast_value = imm7_0 &lt;&lt; 16;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10011) {\n  broadcast_value = imm7_0 &lt;&lt; 24;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10100) {\n  broadcast_value = imm7_0;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10101) {\n  broadcast_value = imm7_0 &lt;&lt; 8;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10110) {\n  broadcast_value = (imm7_0 &lt;&lt; 8) | 0xFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10111) {\n  broadcast_value = (imm7_0 &lt;&lt; 16) | 0xFFFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_8 == 0b11001) {\n  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +\n                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +\n                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 
+\n                    imm7 * 0xFF00000000000000;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11010) {\n  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |\n                    (imm5_0 &lt;&lt; 19);\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11011) {\n  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |\n                    (imm5_0 &lt;&lt; 19);\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11100) {\n  broadcast_value = (imm7 &lt;&lt; 63) | ((1 - imm6) &lt;&lt; 62) | ((imm6 * 0xFF) &lt;&lt; 54) |\n                    (imm5_0 &lt;&lt; 48);\n  broadcast_width = 64;\n}\n\nif (broadcast_width == 8) {\n  for (int i = 0; i &lt; 16; i++) {\n    dst.byte[i] = broadcast_value;\n  }\n} else if (broadcast_width == 16) {\n  for (int i = 0; i &lt; 8; i++) {\n    dst.half[i] = broadcast_value;\n  }\n} else if (broadcast_width == 32) {\n  for (int i = 0; i &lt; 4; i++) {\n    dst.word[i] = broadcast_value;\n  }\n} else if (broadcast_width == 64) {\n  for (int i = 0; i &lt; 2; i++) {\n    dst.dword[i] = broadcast_value;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)\n#include <lsxintrin.h>\nInstruction: vldrepl.b vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRead 8-bit data from memory address `addr + (offset << 0)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu8 data = memory_load(8, addr + offset);\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vldrepl_b (void * addr, imm_n2048_2047 offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldrepl.b vr, r, imm\nCPU 
Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 8-bit data from memory address <code>addr + (offset &lt;&lt; 0)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u8 data = memory_load(8, addr + offset);\nfor (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)\n#include <lsxintrin.h>\nInstruction: vldrepl.d vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRead 64-bit data from memory address `addr + (offset << 3)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu64 data = memory_load(64, addr + (offset << 3));\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vldrepl_d (void * addr, imm_n256_255 offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldrepl.d vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 64-bit data from memory address <code>addr + (offset &lt;&lt; 3)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 data = memory_load(64, addr + (offset &lt;&lt; 3));\nfor (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)\n#include <lsxintrin.h>\nInstruction: vldrepl.h vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRead 16-bit 
data from memory address `addr + (offset << 1)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu16 data = memory_load(16, addr + (offset << 1));\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vldrepl_h (void * addr, imm_n1024_1023 offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldrepl.h vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 16-bit data from memory address <code>addr + (offset &lt;&lt; 1)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u16 data = memory_load(16, addr + (offset &lt;&lt; 1));\nfor (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)\n#include <lsxintrin.h>\nInstruction: vldrepl.w vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRead 32-bit data from memory address `addr + (offset << 2)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu32 data = memory_load(32, addr + (offset << 2));\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vldrepl_w (void * addr, imm_n512_511 offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldrepl.w vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 32-bit data from memory address <code>addr + (offset &lt;&lt; 2)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">u32 data = memory_load(32, addr + (offset &lt;&lt; 2));\nfor (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vldx (void * addr, long int offset)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vldx (void * addr, long int offset)\n#include <lsxintrin.h>\nInstruction: vldx vr, r, r\nCPU Flags: LSX\n```\n\n### Description\n\nRead whole vector from memory address `addr + offset`, save the data into `dst`.  Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.\n\n\n\n\n\n### Operation\n\n```c++\ndst = memory_load(128, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vldx (void * addr, long int offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldx vr, r, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read whole vector from memory address <code>addr + offset</code>, save the data into <code>dst</code>.  
Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = memory_load(128, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmadd.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 8-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmadd_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmadd.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmadd_d (__m128i a, __m128i 
b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmadd.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 64-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmadd_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmadd.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmadd.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 16-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  
dst.half[i] = b.half[i] * c.half[i] + a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmadd_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmadd.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = b.half[i] * c.half[i] + a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmadd.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 32-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = b.word[i] * c.word[i] + a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmadd_w (__m128i a, __m128i b, __m128i 
c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmadd.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = b.word[i] * c.word[i] + a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] =\n      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_d_w (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in 
<code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] =\n      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `b` and unsigned elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_d_wu (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + 
(u64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.d.wu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_d_wu_w (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.d.wu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_h_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": 
"__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `b` and unsigned elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_h_bu (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.h.bu.b vr, vr, vr\nCPU 
Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_h_bu_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.h.bu.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] =\n   
   (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_q_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] =\n      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `b` and unsigned elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_q_du (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.q.du.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vmaddwev_q_du_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.q.du.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] =\n      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_w_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 16-bit 
elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] =\n      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `b` and unsigned elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_w_hu (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  
dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwev.w.hu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwev_w_hu_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwev.w.hu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_d_w (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `b` and unsigned elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +\n                 (u64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_d_wu (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +\n                 (u64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer 
Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.d.wu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_d_wu_w (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.d.wu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i 
__lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_h_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `b` and unsigned 
elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_h_bu (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.h.bu.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + 
(s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_h_bu_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.h.bu.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 
|", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_q_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `b` and unsigned elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +\n                 (u128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_q_du (__m128i a, __m128i b, __m128i c)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vmaddwod.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +\n                 (u128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.q.du.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_q_du_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.q.du.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit 
elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_w_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for 
(int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `b` and unsigned elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +\n                (u32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_w_hu (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +\n                (u32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmaddwod.w.hu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaddwod_w_hu_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaddwod.w.hu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute 
elementwise maximum for unsigned 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vmax_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_hu (__m128i 
a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 
3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmax_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmax_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmax.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmax_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmax.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; 
i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_b (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": 
"__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_bu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)imm);\n}\n```\n\nTested on real 
machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_d (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.du vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_du (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 
64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_h (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_hu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.w vr, vr, imm\nCPU Flags: 
LSX\n```\n\n### Description\n\nCompute elementwise maximum for signed 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmaxi_w (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmaxi.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vmaxi_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmaxi.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested 
on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i 
__lsx_vmin_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": 
"Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmin_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmin_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmin.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = min((u32)a.word[i], 
(u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmin_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmin.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmini.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_b (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.b vr, vr, imm\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmini.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_bu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmini.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_d (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmini.du vr, vr, imm\nCPU Flags: 
LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_du (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmini.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_h (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmini.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_hu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = 
min((u16)a.half[i], (u16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vmini.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for signed 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_w (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmini_wu (__m128i 
a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vmini.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmini_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmini.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual signed 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((s8)a.byte[i] % (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 35 | 0.06(1/15.5) |\n| 3C5000 | 29, 33 | 0.06(1/17) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 35</td>\n<td>0.06(1/15.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 33</td>\n<td>0.06(1/17)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual unsigned 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((u8)a.byte[i] % (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 37 | 0.06(1/17.5) |\n| 3C5000 | 29, 33 | 0.05(1/19) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 37</td>\n<td>0.06(1/17.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 33</td>\n<td>0.05(1/19)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual signed 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((s64)a.dword[i] % (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8, 10 | 0.25(1/4) |\n| 3C5000 | 8, 10 | 0.11(1/9.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8, 10</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 10</td>\n<td>0.11(1/9.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual unsigned 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((u64)a.dword[i] % (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8, 10 | 0.25(1/4) |\n| 3C5000 | 8, 10 | 0.11(1/9.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8, 10</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 10</td>\n<td>0.11(1/9.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual signed 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((s16)a.half[i] % (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17, 21 | 0.12(1/8.5) |\n| 3C5000 | 17, 21 | 0.09(1/11) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17, 21</td>\n<td>0.12(1/8.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 21</td>\n<td>0.09(1/11)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual unsigned 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((u16)a.half[i] % (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17, 21 | 0.11(1/9.5) |\n| 3C5000 | 17, 21 | 0.07(1/15) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17, 21</td>\n<td>0.11(1/9.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 21</td>\n<td>0.07(1/15)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual signed 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((s32)a.word[i] % (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11, 13 | 0.18(1/5.5) |\n| 3C5000 | 11, 15 | 0.08(1/12) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11, 13</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 15</td>\n<td>0.08(1/12)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmod_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmod_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmod.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nModulo residual unsigned 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((u32)a.word[i] % (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11, 13 | 0.18(1/5.5) |\n| 3C5000 | 11, 15 | 0.06(1/16) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmod_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmod.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11, 13</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 15</td>\n<td>0.06(1/16)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmskgez_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmskgez_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vmskgez.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFor each 8-bit element in `a`, if the element is greater than or equal to zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vmskgez_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x00000000000001fe 0x0000000000000000\n__m128i __lsx_vmskgez_b(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x000000000000b7cf 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8080808080808080;\nu64 c = m & a.dword[0];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc 
|= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] |= c << 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskgez_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmskgez.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 8-bit element in <code>a</code>, if the element is greater than or equal to zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskgez_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x00000000000001fe 0x0000000000000000\n__m128i __lsx_vmskgez_b(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x000000000000b7cf 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8080808080808080;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] |= c &lt;&lt; 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmskltz_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmskltz_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vmskltz.b vr, 
vr\nCPU Flags: LSX\n```\n\n### Description\n\nFor each 8-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vmskltz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000fe01 0x0000000000000000\n__m128i __lsx_vmskltz_b(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000004830 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8080808080808080;\nu64 c = m & a.dword[0];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] |= c << 8;\ndst.dword[1] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmskltz.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 8-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000fe01 0x0000000000000000\n__m128i __lsx_vmskltz_b(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000004830 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8080808080808080;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] |= c &lt;&lt; 8;\ndst.dword[1] = 0;\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmskltz_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmskltz_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vmskltz.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFor each 64-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vmskltz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000002 0x0000000000000000\n__m128i __lsx_vmskltz_d(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000000000 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8000000000000000;\nu64 c = m & a.dword[0];\nc >>= 63;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc >>= 63;\ndst.dword[0] |= c << 1;\ndst.dword[1] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmskltz.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 64-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000000000002 0x0000000000000000\n__m128i __lsx_vmskltz_d(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000000000 
0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8000000000000000;\nu64 c = m &amp; a.dword[0];\nc &gt;&gt;= 63;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc &gt;&gt;= 63;\ndst.dword[0] |= c &lt;&lt; 1;\ndst.dword[1] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmskltz_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmskltz_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vmskltz.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFor each 16-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vmskltz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x00000000000000f0 0x0000000000000000\n__m128i __lsx_vmskltz_h(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000000024 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8000800080008000;\nu64 c = m & a.dword[0];\nc |= c << 15;\nc |= c << 30;\nc >>= 60;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 15;\nc |= c << 30;\nc >>= 60;\ndst.dword[0] |= c << 4;\ndst.dword[1] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmskltz.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 16-bit element in <code>a</code>, if the element 
is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x00000000000000f0 0x0000000000000000\n__m128i __lsx_vmskltz_h(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000000024 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8000800080008000;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 15;\nc |= c &lt;&lt; 30;\nc &gt;&gt;= 60;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 15;\nc |= c &lt;&lt; 30;\nc &gt;&gt;= 60;\ndst.dword[0] |= c &lt;&lt; 4;\ndst.dword[1] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmskltz_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmskltz_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vmskltz.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFor each 32-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vmskltz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000000c 0x0000000000000000\n__m128i __lsx_vmskltz_w(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000000004 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8000000080000000;\nu64 c = m & a.dword[0];\nc |= c << 31;\nc >>= 62;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 31;\nc >>= 62;\ndst.dword[0] |= c << 2;\ndst.dword[1] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| 
CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmskltz.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 32-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmskltz_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000000c 0x0000000000000000\n__m128i __lsx_vmskltz_w(__m128i{0x0000808000000000, 0x0081000081716151})\n= 0x0000000000000004 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8000000080000000;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 31;\nc &gt;&gt;= 62;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 31;\nc &gt;&gt;= 62;\ndst.dword[0] |= c &lt;&lt; 2;\ndst.dword[1] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmsknz_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmsknz_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vmsknz.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nFor each 8-bit element in `a`, if the element is non-zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vmsknz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000feff 0x0000000000000000\n__m128i 
__lsx_vmsknz_b(__m128i{0x0000111100000000, 0x0011000011111111})\n= 0x0000000000004f30 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x7F7F7F7F7F7F7F7F;\nu64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] = c;\nc = ~(((a.dword[1] & m) + m) | a.dword[1] | m);\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] |= c << 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmsknz_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmsknz.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 8-bit element in <code>a</code>, if the element is non-zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmsknz_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000feff 0x0000000000000000\n__m128i __lsx_vmsknz_b(__m128i{0x0000111100000000, 0x0011000011111111})\n= 0x0000000000004f30 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x7F7F7F7F7F7F7F7F;\nu64 c = ~(((a.dword[0] &amp; m) + m) | a.dword[0] | m);\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] = c;\nc = ~(((a.dword[1] &amp; m) + m) | a.dword[1] | m);\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] |= c &lt;&lt; 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmsub.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 8-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmsub_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmsub.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmsub_d (__m128i a, 
__m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmsub.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 64-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmsub_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmsub.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmsub.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 16-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];\n}\n```\n\nTested 
on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmsub_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmsub.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vmsub.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 32-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmsub_w (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmsub.w vr, 
vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply signed 8-bit elements in `a` and `b`, save the high 8-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) &gt;&gt; 8;\n}\n</code></pre>\n\n<p>Tested on 
real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply unsigned 8-bit elements in `a` and `b`, save the high 8-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) &gt;&gt; 8;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_d (__m128i 
a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply signed 64-bit elements in `a` and `b`, save the high 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 64-bit elements in <code>a</code> and <code>b</code>, save the high 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) &gt;&gt; 64;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply unsigned 64-bit elements in `a` and `b`, save the high 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = 
(((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the high 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) &gt;&gt; 64;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply signed 16-bit elements in `a` and `b`, save the high 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_h (__m128i a, __m128i b)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vmuh.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) &gt;&gt; 16;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply unsigned 16-bit elements in `a` and `b`, save the high 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (((u32)(u16)a.half[i] * 
(u32)(u16)b.half[i])) &gt;&gt; 16;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply signed 32-bit elements in `a` and `b`, save the high 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 32-bit elements in <code>a</code> and <code>b</code>, save the high 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) &gt;&gt; 32;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", 
"extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmuh_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmuh_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmuh.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply unsigned 32-bit elements in `a` and `b`, save the high 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmuh_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmuh.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the high 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) &gt;&gt; 32;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmul_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmul_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmul.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; 
i < 16; i++) {\n  dst.byte[i] = a.byte[i] * b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmul_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmul.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] * b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmul_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmul_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmul.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] * b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmul_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmul.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 
64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] * b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmul_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmul_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmul.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] * b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmul_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmul.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] * b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmul_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmul_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmul.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] * b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmul_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmul.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] * b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply 
even-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU 
| Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_d_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.d.wu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_d_wu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.d.wu.w 
vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; 
i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_h_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.h.bu.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_h_bu_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.h.bu.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)", "markdown": 
"### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_q_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.q.du.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_q_du_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.q.du.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 
even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_w_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * 
(u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwev.w.hu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwev_w_hu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwev.w.hu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)", "markdown": 
"### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_d_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.d.wu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_d_wu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.d.wu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 
4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_h_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 
odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.h.bu.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_h_bu_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.h.bu.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] 
* (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_q_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_q_du_d (__m128i a, 
__m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.q.du.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_q_du_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.q.du.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned signed 16-bit elements in `a` and signed elements in `b`, 
save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_w_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vmulwod.w.hu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vmulwod_w_hu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vmulwod.w.hu.h vr, vr, vr\nCPU 
Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vneg_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vneg_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vneg.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nNegate 8-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = -a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vneg_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vneg.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 8-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = -a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vneg_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vneg_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vneg.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nNegate 64-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = -a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vneg_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vneg.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 64-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = -a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vneg_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vneg_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vneg.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nNegate 16-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = 
-a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vneg_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vneg.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 16-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = -a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vneg_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vneg_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vneg.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nNegate 32-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = -a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vneg_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vneg.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 32-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = -a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vnor_v (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vnor_v (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vnor.v vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise NOR between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ~(a.dword[i] | b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vnor_v (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vnor.v vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise NOR between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ~(a.dword[i] | b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)", 
"markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vnori.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise NOR between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ~(a.byte[i] | imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vnori_b (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vnori.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise NOR between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ~(a.byte[i] | imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vor_v (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vor_v (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vor.v vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise OR between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] | b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vor_v (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vor.v vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise OR between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] | b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vori_b (__m128i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vori_b (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vori.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise OR between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] | imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vori_b (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vori.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise OR between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] | imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vorn_v (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vorn_v (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vorn.v vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise ORN between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] | (~b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vorn_v (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vorn.v vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise ORN between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] | (~b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackev_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackev_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackev.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### 
Description\n\nCollect and pack even-positioned 8-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackev_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackev.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 8-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackev_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackev_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackev.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack even-positioned 64-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i - 1] : b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackev_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackev.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 64-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackev_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackev_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackev.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack even-positioned 16-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i - 1] : b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackev_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackev.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 16-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackev_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackev_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackev.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack even-positioned 32-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i - 1] : b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackev_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackev.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 32-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackod_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackod_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackod.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack odd-positioned 8-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i] : b.byte[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackod_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackod.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 8-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackod_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackod_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackod.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack odd-positioned 64-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i] : b.dword[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackod_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackod.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 64-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackod_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackod_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackod.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack odd-positioned 16-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i] : b.half[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackod_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackod.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 16-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpackod_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpackod_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpackod.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCollect and pack odd-positioned 32-bit elements in `a` and `b` and store `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i] : b.word[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpackod_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpackod.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 32-bit elements in <code>a</code> and <code>b</code> and store <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpcnt_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpcnt_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vpcnt.b vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 8-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vpcnt_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0202040204040602 0x0404060406060800\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = popcount(a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_b (__m128i a)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vpcnt.b vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 8-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0202040204040602 0x0404060406060800\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = popcount(a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpcnt_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpcnt_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vpcnt.d vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 64-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vpcnt_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000001a 0x0000000000000026\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = popcount(a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpcnt.d vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 64-bit elements in 
<code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x000000000000001a 0x0000000000000026\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = popcount(a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpcnt_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpcnt_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vpcnt.h vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 16-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vpcnt_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0004000600080008 0x0008000a000c0008\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = popcount(a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpcnt.h vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 16-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_h(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0004000600080008 
0x0008000a000c0008\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = popcount(a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpcnt_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpcnt_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vpcnt.w vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 32-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m128i __lsx_vpcnt_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000a00000010 0x0000001200000014\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = popcount(a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpcnt.w vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 32-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpcnt_w(__m128i{0x1122334455667788, 0x99aabbccddeeff00})\n= 0x0000000a00000010 0x0000001200000014\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = 
popcount(a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vpermi.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPermute words from `a` and `b` with indices recorded in `imm` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst.word[0] = b.word[imm & 0x3];\ndst.word[1] = b.word[(imm >> 2) & 0x3];\ndst.word[2] = a.word[(imm >> 4) & 0x3];\ndst.word[3] = a.word[(imm >> 6) & 0x3];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpermi_w (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpermi.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Permute words from <code>a</code> and <code>b</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.word[0] = b.word[imm &amp; 0x3];\ndst.word[1] = b.word[(imm &gt;&gt; 2) &amp; 0x3];\ndst.word[2] = a.word[(imm &gt;&gt; 4) &amp; 0x3];\ndst.word[3] = a.word[(imm &gt;&gt; 6) &amp; 0x3];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Permutation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickev_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickev_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickev.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick even-positioned 8-bit elements in `b` first, then pick even-positioned 8-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickev_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickev.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 8-bit elements in <code>b</code> first, then pick even-positioned 8-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? 
b.byte[i * 2] : a.byte[(i - 8) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickev_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickev_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickev.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick even-positioned 64-bit elements in `b` first, then pick even-positioned 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickev_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickev.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 64-bit elements in <code>b</code> first, then pick even-positioned 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? 
b.dword[i * 2] : a.dword[(i - 1) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickev_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickev_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickev.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick even-positioned 16-bit elements in `b` first, then pick even-positioned 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickev_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickev.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 16-bit elements in <code>b</code> first, then pick even-positioned 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? 
b.half[i * 2] : a.half[(i - 4) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickev_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickev_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickev.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick even-positioned 32-bit elements in `b` first, then pick even-positioned 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickev_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickev.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 32-bit elements in <code>b</code> first, then pick even-positioned 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? 
b.word[i * 2] : a.word[(i - 2) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickod_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickod_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickod.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick odd-positioned 8-bit elements in `b` first, then pick odd-positioned 8-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickod_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickod.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 8-bit elements in <code>b</code> first, then pick odd-positioned 8-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? 
b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickod_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickod_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickod.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick odd-positioned 64-bit elements in `b` first, then pick odd-positioned 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickod_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickod.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 64-bit elements in <code>b</code> first, then pick odd-positioned 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? 
b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickod_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickod_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickod.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick odd-positioned 16-bit elements in `b` first, then pick odd-positioned 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickod_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickod.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 16-bit elements in <code>b</code> first, then pick odd-positioned 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? 
b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vpickod_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vpickod_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vpickod.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nPick odd-positioned 32-bit elements in `b` first, then pick odd-positioned 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vpickod_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickod.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 32-bit elements in <code>b</code> first, then pick odd-positioned 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? 
b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplgr2vr_b (int val)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplgr2vr_b (int val)\n#include <lsxintrin.h>\nInstruction: vreplgr2vr.b vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplgr2vr_b (int val)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplgr2vr.b vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplgr2vr_d (long int val)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplgr2vr_d (long int val)\n#include <lsxintrin.h>\nInstruction: vreplgr2vr.d vr, r\nCPU Flags: LSX\n```\n\n### 
Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplgr2vr_d (long int val)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplgr2vr.d vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplgr2vr_h (int val)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplgr2vr_h (int val)\n#include <lsxintrin.h>\nInstruction: vreplgr2vr.h vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplgr2vr_h (int val)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplgr2vr.h vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplgr2vr_w (int val)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplgr2vr_w (int val)\n#include <lsxintrin.h>\nInstruction: vreplgr2vr.w vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplgr2vr_w (int val)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplgr2vr.w vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrepli_b (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrepli_b (imm_n512_511 imm)\n#include <lsxintrin.h>\nInstruction: vldi 
vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrepli_b (imm_n512_511 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldi vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrepli_d (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrepli_d (imm_n512_511 imm)\n#include <lsxintrin.h>\nInstruction: vldi vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrepli_d (imm_n512_511 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldi vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrepli_h (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrepli_h (imm_n512_511 imm)\n#include <lsxintrin.h>\nInstruction: vldi vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n 
 dst.half[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrepli_h (imm_n512_511 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldi vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrepli_w (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrepli_w (imm_n512_511 imm)\n#include <lsxintrin.h>\nInstruction: vldi vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrepli_w (imm_n512_511 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vldi vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplve_b (__m128i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplve_b (__m128i a, int idx)\n#include <lsxintrin.h>\nInstruction: vreplve.b vr, vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[idx % 16];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | 
Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplve_b (__m128i a, int idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplve.b vr, vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[idx % 16];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplve_d (__m128i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplve_d (__m128i a, int idx)\n#include <lsxintrin.h>\nInstruction: vreplve.d vr, vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[idx % 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplve_d (__m128i a, int idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplve.d vr, vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; 
i &lt; 2; i++) {\n  dst.dword[i] = a.dword[idx % 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplve_h (__m128i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplve_h (__m128i a, int idx)\n#include <lsxintrin.h>\nInstruction: vreplve.h vr, vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[idx % 8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplve_h (__m128i a, int idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplve.h vr, vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[idx % 8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplve_w (__m128i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m128i 
__lsx_vreplve_w (__m128i a, int idx)\n#include <lsxintrin.h>\nInstruction: vreplve.w vr, vr, r\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[idx % 4];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplve_w (__m128i a, int idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplve.w vr, vr, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[idx % 4];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)\n#include <lsxintrin.h>\nInstruction: vreplvei.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[idx];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplvei_b (__m128i a, imm0_15 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplvei.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[idx];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)\n#include <lsxintrin.h>\nInstruction: vreplvei.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[idx];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplvei_d (__m128i a, imm0_1 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplvei.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[idx];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)\n#include <lsxintrin.h>\nInstruction: vreplvei.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[idx];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplvei_h (__m128i a, imm0_7 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplvei.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[idx];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)\n#include <lsxintrin.h>\nInstruction: vreplvei.w vr, vr, imm\nCPU Flags: 
LSX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[idx];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vreplvei_w (__m128i a, imm0_3 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vreplvei.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[idx];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotr_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotr_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vrotr.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] =\n      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vrotr_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotr.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] =\n      (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) | (a.byte[i] &lt;&lt; (8 - (b.byte[i] &amp; 0x7)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotr_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotr_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vrotr.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) |\n                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotr_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotr.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result 
to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) |\n                 (a.dword[i] &lt;&lt; (64 - (b.dword[i] &amp; 0x3f)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotr_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotr_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vrotr.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |\n                (a.half[i] << (16 - (b.half[i] & 0xf)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotr_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotr.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) |\n                (a.half[i] &lt;&lt; (16 - (b.half[i] &amp; 
0xf)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotr_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotr_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vrotr.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |\n                (a.word[i] << (32 - (b.word[i] & 0x1f)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotr_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotr.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) |\n                (a.word[i] &lt;&lt; (32 - (b.word[i] &amp; 0x1f)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vrotri.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotri_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotri.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (a.byte[i] &gt;&gt; imm) | (a.byte[i] &lt;&lt; (8 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: 
vrotri.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotri_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotri.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (a.dword[i] &gt;&gt; imm) | (a.dword[i] &lt;&lt; (64 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vrotri.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotri_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotri.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (a.half[i] &gt;&gt; imm) | (a.half[i] &lt;&lt; (16 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vrotri.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nRotate right the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (a.word[i] >> imm) | (a.word[i] << (32 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vrotri_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vrotri.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 32-bit elements in <code>a</code> 
by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (a.word[i] &gt;&gt; imm) | (a.word[i] &lt;&lt; (32 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the signed 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the unsigned 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.d vr, 
vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the signed 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the unsigned 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 
| 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the signed 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the unsigned 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the signed 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsadd_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsadd_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsadd.wu vr, 
vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating add the unsigned 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsadd_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsadd.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsat.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp signed 8-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 
3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 8-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = clamp&lt;s8&gt;(a.byte[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsat.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp unsigned 8-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_bu (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 8-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 
0; i &lt; 16; i++) {\n  dst.byte[i] = clamp&lt;u8&gt;(a.byte[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsat.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp signed 64-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 64-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = clamp&lt;s64&gt;(a.dword[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", 
"extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsat.du vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp unsigned 64-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_du (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 64-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = clamp&lt;u64&gt;(a.dword[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsat.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp signed 16-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = clamp<s16>(a.half[i], -(1 
<< imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 16-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = clamp&lt;s16&gt;(a.half[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsat.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp unsigned 16-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_hu (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.hu vr, vr, imm\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 16-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = clamp&lt;u16&gt;(a.half[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsat.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp signed 32-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 32-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = clamp&lt;s32&gt;(a.word[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsat.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nClamp unsigned 32-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsat_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsat.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 32-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = clamp&lt;u32&gt;(a.word[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseq_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseq_b (__m128i a, __m128i b)\n#include 
<lsxintrin.h>\nInstruction: vseq.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 8-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseq_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseq.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseq_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseq_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vseq.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 64-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseq_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseq.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseq_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseq_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vseq.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 16-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (a.half[i] == b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseq_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseq.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseq_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseq_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vseq.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 32-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (a.word[i] == b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseq_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseq.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vseqi.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 8-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] == imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseqi_b (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseqi.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vseqi.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 64-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] == imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseqi_d (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseqi.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vseqi.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 16-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] == imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseqi_h (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseqi.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vseqi.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the 32-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] == imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vseqi_w (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseqi.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vshuf4i.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle every four 8-bit elements in `a` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/vshuf4i_b.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf4i_b(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0x13ef13cd78667815 0x3412343421432121\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 
| 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_b (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf4i.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 8-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf4i_b.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_b(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0x13ef13cd78667815 0x3412343421432121\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vshuf4i.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle every four 64-bit elements in `a` and `b` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/vshuf4i_d.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf4i_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0xabcdef1314156678 0x1122334455667788\n```\n\n\n### Operation\n\n```c++\ndst.dword[0] = (imm & 2) ? 
b.dword[(imm & 1)] : a.dword[(imm & 1)];\ndst.dword[1] =\n    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_d (__m128i a, __m128i b, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf4i.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 64-bit elements in <code>a</code> and <code>b</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf4i_d.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_d(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0xabcdef1314156678 0x1122334455667788\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.dword[0] = (imm &amp; 2) ? b.dword[(imm &amp; 1)] : a.dword[(imm &amp; 1)];\ndst.dword[1] =\n    (imm &amp; 8) ? 
b.dword[((imm &gt;&gt; 2) &amp; 1)] : a.dword[((imm &gt;&gt; 2) &amp; 1)];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vshuf4i.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle every four 16-bit elements in `a` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/vshuf4i_h.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf4i_h(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0x667814156678ef13 0x4321432143211234\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_h (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf4i.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 16-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf4i_h.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_h(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0x667814156678ef13 
0x4321432143211234\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vshuf4i.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle every four 32-bit elements in `a` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/vshuf4i_w.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf4i_w(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0x1415667843214321 0x14156678abcdef13\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf4i_w (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf4i.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 32-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf4i_w.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vshuf4i_w(__m128i{0xabcdef1314156678, 0x1234123443214321}, 0x12)\n= 0x1415667843214321 0x14156678abcdef13\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vshuf.b vr, vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\n\nShuffle bytes from `a` and `b` with indices from `c`.\n\nCaveat: the indices are placed in `c`, while in other `vshuf` intrinsics, they are placed in `a`.\n\n\n![](../diagram/vshuf_b.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, __m128i{0x0011021304050607, 0x0811120213031404})\n= 0x7877155513efcdab 0x2177661555144413\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (c.byte[i] >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.byte[i] = 0;\n  } else if ((c.byte[i] % 32) < 16) {\n    dst.byte[i] = b.byte[c.byte[i] % 16];\n  } else {\n    dst.byte[i] = a.byte[c.byte[i] % 16];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vshuf_b (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf.b vr, vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle bytes from <code>a</code> and <code>b</code> with indices from <code>c</code>.</p>\n<p>Caveat: the indices are placed in <code>c</code>, while in other <code>vshuf</code> intrinsics, they are placed in <code>a</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf_b.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf_b(__m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321}, __m128i{0x0011021304050607, 0x0811120213031404})\n= 0x7877155513efcdab 0x2177661555144413\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (c.byte[i] &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.byte[i] = 0;\n  } else if ((c.byte[i] % 32) &lt; 16) {\n    dst.byte[i] = b.byte[c.byte[i] % 16];\n  } else {\n    dst.byte[i] = a.byte[c.byte[i] % 16];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vshuf.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle 64-bit elements in `b` and `c` with indices from `a`, save the result to `dst`.\n\n![](../diagram/vshuf_d.svg)\n\n\n### Examples\n\n```c++\n__m128i 
__lsx_vshuf_d(__m128i{0x0000000000000001, 0x0000000000000002}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})\n= 0x1234123443214321 0x1122334455667788\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.dword[i] = 0;\n  } else if ((a.dword[i] % 4) < 2) {\n    dst.dword[i] = c.dword[a.dword[i] % 2];\n  } else {\n    dst.dword[i] = b.dword[a.dword[i] % 2];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf_d (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle 64-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf_d.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf_d(__m128i{0x0000000000000001, 0x0000000000000002}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})\n= 0x1234123443214321 0x1122334455667788\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if ((a.dword[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.dword[i] = 0;\n  } else if ((a.dword[i] % 4) &lt; 2) {\n    dst.dword[i] = c.dword[a.dword[i] % 2];\n  } else {\n    dst.dword[i] = b.dword[a.dword[i] % 2];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vshuf.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle 16-bit elements in `b` and `c` with indices from `a`, save the result to `dst`.\n\n![](../diagram/vshuf_h.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf_h(__m128i{0x0001000200030004, 0x0005000a000b000c}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})\n= 0x1415ef13abcd4321 0x432133441122ff00\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.half[i] = 0;\n  } else if ((a.half[i] % 16) < 8) {\n    dst.half[i] = c.half[a.half[i] % 8];\n  } else {\n    dst.half[i] = b.half[a.half[i] % 8];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf_h (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle 16-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf_h.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vshuf_h(__m128i{0x0001000200030004, 0x0005000a000b000c}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})\n= 0x1415ef13abcd4321 0x432133441122ff00\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if ((a.half[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.half[i] = 0;\n  } else if ((a.half[i] % 16) &lt; 8) {\n    dst.half[i] = c.half[a.half[i] % 8];\n  } else {\n    dst.half[i] = b.half[a.half[i] % 8];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)\n#include <lsxintrin.h>\nInstruction: vshuf.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nShuffle 32-bit elements in `b` and `c` with indices from `a`, save the result to `dst`.\n\n![](../diagram/vshuf_w.svg)\n\n\n### Examples\n\n```c++\n__m128i __lsx_vshuf_w(__m128i{0x0000000200000004, 0x0000000700000005}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})\n= 0x4321432155667788 0x99aabbcc11223344\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.word[i] = 0;\n  } else if ((a.word[i] % 8) < 4) {\n    dst.word[i] = c.word[a.word[i] % 4];\n  } else {\n    dst.word[i] = b.word[a.word[i] % 4];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf_w (__m128i a, __m128i b, __m128i c)\n#include &lt;lsxintrin.h&gt;\nInstruction: vshuf.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle 32-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/vshuf_w.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vshuf_w(__m128i{0x0000000200000004, 0x0000000700000005}, __m128i{0x1122334455667788, 0x99aabbccddeeff00}, __m128i{0xabcdef1314156678, 0x1234123443214321})\n= 0x4321432155667788 0x99aabbcc11223344\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if ((a.word[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.word[i] = 0;\n  } else if ((a.word[i] % 8) &lt; 4) {\n    dst.word[i] = c.word[a.word[i] % 4];\n  } else {\n    dst.word[i] = b.word[a.word[i] % 4];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsigncov_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsigncov_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsigncov.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nIf the 8-bit element in `a` equals to zero, set the result to zero. 
If the signed 8-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] =\n      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsigncov_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsigncov.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 8-bit element in <code>a</code> equals to zero, set the result to zero. If the signed 8-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] =\n      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] &gt; 0 ? 
b.byte[i] : -b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsigncov_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsigncov_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsigncov.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nIf the 64-bit element in `a` equals to zero, set the result to zero. If the signed 64-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] =\n      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsigncov_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsigncov.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 64-bit element in <code>a</code> equals to zero, set the result to zero. If the signed 64-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. 
If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] =\n      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] &gt; 0 ? b.dword[i] : -b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsigncov_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsigncov_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsigncov.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nIf the 16-bit element in `a` equals to zero, set the result to zero. If the signed 16-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? b.half[i] : -b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsigncov_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsigncov.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 16-bit element in <code>a</code> equals to zero, set the result to zero. 
If the signed 16-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (a.half[i] == 0) ? 0 : ((s16)a.half[i] &gt; 0 ? b.half[i] : -b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsigncov_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsigncov_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsigncov.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nIf the 32-bit element in `a` equals to zero, set the result to zero. If the signed 32-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] =\n      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? 
b.word[i] : -b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsigncov_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsigncov.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 32-bit element in <code>a</code> equals to zero, set the result to zero. If the signed 32-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] =\n      (a.word[i] == 0) ? 0 : ((s32)a.word[i] &gt; 0 ? b.word[i] : -b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt;= (s8)b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt;= (u8)b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt;= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt;= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt;= (s16)b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt;= (u16)b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt;= (s32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsle_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsle_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsle.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsle_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsle.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt;= (u32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslei.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` with the immediate `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] <= imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_b (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> with the immediate <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt;= imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslei.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` with the immediate `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] <= imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_bu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> with the immediate <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt;= imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslei.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` with the immediate `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] <= imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_d (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt;= imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslei.du vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] <= imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_du (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt;= imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslei.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] <= imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_h (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt;= imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslei.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] <= imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_hu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt;= imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslei.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] <= imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_w (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt;= imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslei.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] <= imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslei_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslei.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt;= imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsll_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsll_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsll.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsll_b (__m128i a, 
__m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsll.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &lt;&lt; (b.byte[i] &amp; 0x7);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsll_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsll_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsll.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsll_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsll.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &lt;&lt; 
(b.dword[i] &amp; 0x3f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsll_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsll_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsll.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] << (b.half[i] & 0xf);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsll_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsll.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] &lt;&lt; (b.half[i] &amp; 0xf);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": 
"__m128i __lsx_vsll_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsll_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsll.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsll_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsll.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] &lt;&lt; (b.word[i] &amp; 0x1f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vslli.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] 
<< imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslli_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslli.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vslli.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslli_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslli.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical 
left shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vslli.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslli_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslli.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslli.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical left shift the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslli_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslli.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsllwil.d.w vr, vr, imm\nCPU Flags: 
LSX\n```\n\n### Description\n\nExtend and shift signed 32-bit elements in `a` by `imm` to signed 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsllwil_d_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsllwil.d.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift signed 32-bit elements in <code>a</code> by <code>imm</code> to signed 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsllwil.du.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtend and shift unsigned 32-bit elements in `a` by `imm` to unsigned 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsllwil_du_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsllwil.du.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift unsigned 32-bit elements in <code>a</code> by <code>imm</code> to unsigned 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsllwil.h.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtend and shift signed 8-bit elements in `a` by `imm` to signed 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsllwil_h_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsllwil.h.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift signed 8-bit elements in <code>a</code> by <code>imm</code> to signed 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 
0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsllwil.hu.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtend and shift unsigned 8-bit elements in `a` by `imm` to unsigned 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsllwil_hu_bu (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsllwil.hu.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift unsigned 8-bit elements in <code>a</code> by <code>imm</code> to unsigned 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": 
true}, {"name": "__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsllwil.w.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtend and shift signed 16-bit elements in `a` by `imm` to signed 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsllwil_w_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsllwil.w.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift signed 16-bit elements in <code>a</code> by <code>imm</code> to signed 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsllwil.wu.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nExtend and shift unsigned 16-bit elements in `a` by `imm` to unsigned 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = 
(u32)(u16)a.half[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsllwil_wu_hu (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsllwil.wu.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift unsigned 16-bit elements in <code>a</code> by <code>imm</code> to unsigned 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt; (s8)b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt; (u8)b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt; (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt; (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt; (s16)b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt; (u16)b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt; (s32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslt_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslt_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vslt.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslt_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslt.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt; (u32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslti.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] < imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_b (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt; imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslti.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] < imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_bu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt; imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslti.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] < imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_d (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslti.du vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] < imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_du (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslti.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i] < imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_h (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt; imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslti.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((u16)a.half[i] < imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_hu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt; imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)\n#include <lsxintrin.h>\nInstruction: vslti.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i] < imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_w (__m128i a, imm_n16_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vslti.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((u32)a.word[i] < imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vslti_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vslti.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsra_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsra_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsra.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsra_b (__m128i a, 
__m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsra.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; (b.byte[i] &amp; 0x7);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsra_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsra_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsra.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsra_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsra.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = 
((s64)a.dword[i]) &gt;&gt; (b.dword[i] &amp; 0x3f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsra_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsra_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsra.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsra_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsra.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i]) &gt;&gt; (b.half[i] &amp; 0xf);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", 
"extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsra_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsra_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsra.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsra_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsra.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i]) &gt;&gt; (b.word[i] &amp; 0x1f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsrai.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrai_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrai.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrai.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrai_d (__m128i a, imm0_63 imm)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vsrai.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrai.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = ((s16)a.half[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrai_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrai.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = ((s16)a.half[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrai.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = ((s32)a.word[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrai_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrai.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = ((s32)a.word[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsran_b_h (__m128i a, __m128i b)", "markdown": "### 
Synopsis\n\n```c++\n__m128i __lsx_vsran_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsran.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsran_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsran.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? 
(s8)((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsran_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsran_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsran.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsran_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsran.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? 
(s16)((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsran_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsran_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsran.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsran_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsran.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? 
(s32)((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrani.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] =\n      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrani_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrani.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] =\n      (i &lt; 8) ? 
(s8)((s16)b.half[i] &gt;&gt; imm) : (s8)((s16)a.half[i - 8] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vsrani.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)\n                         : (s64)((s128)a.qword[i - 1] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrani_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrani.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? 
(s64)((s128)b.qword[i] &gt;&gt; imm)\n                         : (s64)((s128)a.qword[i - 1] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrani.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrani_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrani.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (i &lt; 4) ? 
(s16)((s32)b.word[i] &gt;&gt; imm) : (s16)((s32)a.word[i - 4] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrani.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)\n                        : (s32)((s64)a.dword[i - 2] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrani_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrani.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? 
(s32)((s64)b.dword[i] &gt;&gt; imm)\n                        : (s32)((s64)a.dword[i - 2] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrar_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrar_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrar.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if ((b.byte[i] & 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +\n                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrar_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrar.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if ((b.byte[i] &amp; 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +\n                  
(((s8)a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrar_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrar_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrar.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if ((b.dword[i] & 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +\n                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrar_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrar.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if ((b.dword[i] &amp; 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +\n                   (((s64)a.dword[i] 
&gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrar_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrar_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrar.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if ((b.half[i] & 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +\n                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrar_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrar.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if ((b.half[i] &amp; 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +\n                  (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 
1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrar_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrar_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrar.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if ((b.word[i] & 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +\n                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrar_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrar.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if ((b.word[i] &amp; 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +\n                  (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);\n  
}\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsrari.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrari_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrari.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; imm) + (((s8)a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrari.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] =\n        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrari_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrari.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] =\n        ((s64)a.dword[i] &gt;&gt; imm) + (((s64)a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrari.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] =\n        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrari_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrari.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] =\n        ((s16)a.half[i] &gt;&gt; imm) + (((s16)a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrari.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] =\n        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrari_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrari.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] =\n        ((s32)a.word[i] &gt;&gt; imm) + (((s32)a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrarn.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u8 shift = (b.half[i] & 15);\n    if (shift == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +\n                         (((s16)a.half[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarn_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarn.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u8 shift = (b.half[i] &amp; 15);\n    if (shift == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i] &gt;&gt; shift) +\n                         (((s16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } 
else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrarn.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u8 shift = (b.word[i] & 31);\n    if (shift == 0) {\n      dst.half[i] = (s16)(s32)a.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +\n                          (((s32)a.word[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarn_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarn.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u8 shift = (b.word[i] &amp; 31);\n    if (shift == 0) {\n      
dst.half[i] = (s16)(s32)a.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i] &gt;&gt; shift) +\n                          (((s32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrarn.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u8 shift = (b.dword[i] & 63);\n    if (shift == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +\n                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarn_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarn.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u8 shift = (b.dword[i] &amp; 63);\n    if (shift == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i] &gt;&gt; shift) +\n                          (((s64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrarni.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +\n                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarni.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (s8)(((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 8] &gt;&gt; imm) +\n                         (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vsrarni.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)b.qword[i];\n    } else {\n      dst.dword[i] = 
(s64)(((s128)b.qword[i] >> imm) +\n                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +\n                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarni.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)b.qword[i];\n    } else {\n      dst.dword[i] = (s64)(((s128)b.qword[i] &gt;&gt; imm) +\n                           (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (s64)(((s128)a.qword[i - 1] &gt;&gt; imm) +\n                           (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": 
true}, {"name": "__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrarni.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)b.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +\n                          (((s32)b.word[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +\n                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarni.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)b.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)b.word[i] &gt;&gt; imm) +\n                          (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = 
(s16)(s32)a.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 4] &gt;&gt; imm) +\n                          (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrarni.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)b.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +\n                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +\n                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrarni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrarni.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right 
shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)b.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)b.dword[i] &gt;&gt; imm) +\n                          (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 2] &gt;&gt; imm) +\n                          (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrl_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrl_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrl.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrl_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrl.b vr, vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrl_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrl_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrl.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrl_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrl.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f);\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrl_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrl_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrl.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrl_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrl.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] &gt;&gt; (b.half[i] &amp; 0xf);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrl_w (__m128i a, __m128i b)", "markdown": "### 
Synopsis\n\n```c++\n__m128i __lsx_vsrl_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrl.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrl_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrl.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsrli.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency 
and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrli_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrli.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrli.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrli_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrli.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> 
by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrli.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrli_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrli.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrli.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrli_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrli.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrln.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### 
Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrln_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrln.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? (u8)((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrln.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? 
(u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrln_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrln.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? (u16)((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrln.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? 
(u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrln_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrln.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? (u32)((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrlni.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] =\n      (i < 8) ? 
(u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlni.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] =\n      (i &lt; 8) ? (u8)((u16)b.half[i] &gt;&gt; imm) : (u8)((u16)a.half[i - 8] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vsrlni.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? 
(u64)((u128)b.qword[i] >> imm)\n                         : (u64)((u128)a.qword[i - 1] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlni.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? (u64)((u128)b.qword[i] &gt;&gt; imm)\n                         : (u64)((u128)a.qword[i - 1] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrlni.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (i < 4) ? 
(u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlni.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (i &lt; 4) ? (u16)((u32)b.word[i] &gt;&gt; imm) : (u16)((u32)a.word[i - 4] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrlni.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? 
(u32)((u64)b.dword[i] >> imm)\n                        : (u32)((u64)a.dword[i - 2] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlni.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? (u32)((u64)b.dword[i] &gt;&gt; imm)\n                        : (u32)((u64)a.dword[i - 2] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlr_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlr_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlr.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if ((b.byte[i] & 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +\n                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real 
machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlr_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlr.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if ((b.byte[i] &amp; 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +\n                  ((a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlr_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlr_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlr.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if ((b.dword[i] & 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +\n                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| 
CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlr_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlr.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if ((b.dword[i] &amp; 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +\n                   ((a.dword[i] &gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlr_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlr_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlr.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if ((b.half[i] & 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +\n                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlr_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlr.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if ((b.half[i] &amp; 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +\n                  ((a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlr_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlr_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlr.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if ((b.word[i] & 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +\n                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 
|\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlr_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlr.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if ((b.word[i] &amp; 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +\n                  ((a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlri_b (__m128i a, imm0_7 imm)\n#include <lsxintrin.h>\nInstruction: vsrlri.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vsrlri_b (__m128i a, imm0_7 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlri.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] &gt;&gt; imm) + ((a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrlri.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlri_d (__m128i a, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlri.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with 
rounding) the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] &gt;&gt; imm) + ((a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrlri.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlri_h (__m128i a, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlri.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] &gt;&gt; imm) + ((a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrlri.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlri_w (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlri.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] &gt;&gt; imm) + ((a.word[i] &gt;&gt; (imm - 1)) &amp; 
0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlrn.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u8 shift = (b.half[i] & 15);\n    if (shift == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +\n                         (((u16)a.half[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlrn_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrn.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u8 shift = (b.half[i] &amp; 15);\n    if (shift == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i];\n 
   } else {\n      dst.byte[i] = (u8)(((u16)a.half[i] &gt;&gt; shift) +\n                         (((u16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlrn.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u8 shift = (b.word[i] & 31);\n    if (shift == 0) {\n      dst.half[i] = (u16)(u32)a.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +\n                          (((u32)a.word[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlrn_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrn.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u8 shift = (b.word[i] &amp; 31);\n    if (shift == 0) {\n      dst.half[i] = (u16)(u32)a.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i] &gt;&gt; shift) +\n                          (((u32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsrlrn.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u8 shift = (b.dword[i] & 63);\n    if (shift == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +\n                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlrn_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrn.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical 
right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u8 shift = (b.dword[i] &amp; 63);\n    if (shift == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i] &gt;&gt; shift) +\n                          (((u64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vsrlrni.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +\n                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrni.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (u8)(((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 8] &gt;&gt; imm) +\n                         (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vsrlrni.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; 
i++) {\n  if (i < 1) {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)b.qword[i];\n    } else {\n      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +\n                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +\n                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrni.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)b.qword[i];\n    } else {\n      dst.dword[i] = (u64)(((u128)b.qword[i] &gt;&gt; imm) +\n                           (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (u64)(((u128)a.qword[i - 1] &gt;&gt; imm) +\n                           (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsrlrni.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)b.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +\n                          (((u32)b.word[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +\n                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrni.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    if (imm == 0) {\n      dst.half[i] = 
(u16)(u32)b.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)b.word[i] &gt;&gt; imm) +\n                          (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 4] &gt;&gt; imm) +\n                          (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vsrlrni.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)b.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +\n                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +\n                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vsrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsrlrni.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)b.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)b.dword[i] &gt;&gt; imm) +\n                          (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 2] &gt;&gt; imm) +\n                          (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssran_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssran_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssran.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  
}\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssran_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssran.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssran.bu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssran_bu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssran.bu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssran_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssran_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssran.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 
| 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssran_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssran.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssran.hu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m128i __lsx_vssran_hu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssran.hu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssran_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssran_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssran.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssran_w_d (__m128i a, __m128i 
b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssran.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssran.wu.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssran_wu_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssran.wu.d vr, 
vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)b.half[i] >> imm;\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    s16 temp = (s16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: 
vssrani.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.bu.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)b.half[i] >> imm;\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    s16 temp = (s16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_bu_h 
(__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.bu.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp = (s128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp = (s128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.du.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp = (s128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp = 
(s128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.du.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp = (s128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i 
< 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)b.word[i] >> imm;\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    s32 temp = (s32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.hu.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in 
unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)b.word[i] >> imm;\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    s32 temp = (s32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.hu.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### 
Description\n\nArithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)b.dword[i] >> imm;\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i 
__lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrani.wu.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)b.dword[i] >> imm;\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrani_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrani.wu.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": 
"__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrarn.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +\n             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarn_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarn.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrarn.bu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +\n             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarn_bu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarn.bu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp 
= ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrarn.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +\n             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarn_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarn.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 
16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarn_hu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrarn.hu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +\n             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vssrarn_hu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarn.hu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrarn.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } 
else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarn_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarn.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrarn.wu.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarn_wu_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarn.wu.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### 
Synopsis\n\n```c++\n__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = 
(s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.bu.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.bu.h vr, vr, imm\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = 
clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] >> imm) +\n             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +\n             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.du.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] >> imm) +\n             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.du.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +\n             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 
32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.hu.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### 
Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.hu.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = 
clamp&lt;s32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] >> imm) +\n             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and 
<code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +\n             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrarni.wu.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 
2];\n    } else {\n      temp = ((s64)a.dword[i - 2] >> imm) +\n             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrarni_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrarni.wu.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +\n             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i 
__lsx_vssrln_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrln.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrln_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrln.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrln.bu.h vr, vr, 
vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrln_bu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrln.bu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrln.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements 
in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrln_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrln.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrln.hu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrln_hu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrln.hu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrln.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n   
 u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrln_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrln.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrln.wu.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = 
clamp<u64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrln_wu_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrln.wu.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)b.half[i] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    u16 temp = 
(u16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.bu.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)b.half[i] >> imm;\n 
   dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    u16 temp = (u16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.bu.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 
0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp = (u128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp = (u128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.du.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 
128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp = (u128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.du.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp = (u128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_h_w (__m128i 
a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)b.word[i] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    u32 temp = (u32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 
imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.hu.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)b.word[i] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    u32 temp = (u32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.hu.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", 
"display": true}, {"name": "__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)b.dword[i] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrlni.wu.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)b.dword[i] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlni_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlni.wu.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrlrn.b.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +\n             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrn_b_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrn.b.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (u16)a.half[i];\n    } 
else {\n      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrlrn.bu.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +\n             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrn_bu_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrn.bu.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to 
fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrn_h_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrlrn.h.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +\n             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i 
__lsx_vssrlrn_h_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrn.h.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrlrn.hu.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +\n             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    
dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrn_hu_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrn.hu.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssrlrn.w.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  
if (i < 2) {\n    u64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrn_w_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrn.w.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)\n#include 
<lsxintrin.h>\nInstruction: vssrlrn.wu.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrn_wu_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrn.wu.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.b.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_b_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.b.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) 
{\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.bu.h vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_bu_h (__m128i a, __m128i b, imm0_15 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.bu.h vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.d.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and 
store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] >> imm) +\n             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_d_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.d.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +\n             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  
}\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.du.q vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] >> imm) +\n             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_du_q (__m128i a, __m128i b, imm0_127 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.du.q vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by 
<code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +\n             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.h.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n 
     temp =\n          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_h_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.h.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i 
b, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.hu.w vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_hu_w (__m128i a, __m128i b, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.hu.w vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i 
- 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.w.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] >> imm) +\n             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_w_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.w.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right 
shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +\n             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include <lsxintrin.h>\nInstruction: vssrlrni.wu.d vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    u64 
temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] >> imm) +\n             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssrlrni_wu_d (__m128i a, __m128i b, imm0_63 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssrlrni.wu.d vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +\n             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_b (__m128i a, 
__m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the signed 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the unsigned 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (u8)ssub((u8)a.byte[i], 
(u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the signed 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.d vr, vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the unsigned 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the signed 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_hu (__m128i a, __m128i b)", 
"markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the unsigned 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the signed 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)ssub((s32)a.word[i], 
(s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vssub_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vssub_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vssub.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSaturating subtract the unsigned 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vssub_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vssub.wu vr, vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsub_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsub_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsub.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] - b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsub_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsub.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] - b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsub_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsub_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsub.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] - b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsub_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsub.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] - b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsub_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsub_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsub.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 16-bit elements 
in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] - b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsub_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsub.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] - b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsub_q (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsub_q (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsub.q vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 128-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst.qword[0] = a.qword[0] - b.qword[0];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsub_q (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsub.q vr, vr, vr\nCPU 
Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.qword[0] = a.qword[0] - b.qword[0];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsub_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsub_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsub.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] - b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsub_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsub.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] - b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsubi.bu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 8-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubi_bu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubi.bu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 8-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsubi.du vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 
64-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubi_du (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubi.du vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 64-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsubi.hu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 16-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubi_hu (__m128i a, imm0_31 imm)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vsubi.hu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 16-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)\n#include <lsxintrin.h>\nInstruction: vsubi.wu vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract 32-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubi_wu (__m128i a, imm0_31 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubi.wu vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 32-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, 
{"name": "__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_d_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned signed 8-bit elements in `a` and 
signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_h_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.q.d vr, vr, vr\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_q_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 
1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwev.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract even-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwev_w_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwev.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)", "markdown": 
"### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.d.w vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_d_w (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.d.w vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.d.wu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_d_wu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.d.wu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.h.b vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 
| 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_h_b (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.h.b vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.h.bu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_h_bu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.h.bu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 
odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.q.d vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_q_d (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.q.d vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - 
(s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.q.du vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_q_du (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.q.du vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.w.h vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_w_h (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.w.h vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)", "markdown": 
"### Synopsis\n\n```c++\n__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vsubwod.w.hu vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vsubwod_w_hu (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsubwod.w.hu vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vxor_v (__m128i a, __m128i b)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vxor_v (__m128i a, __m128i b)\n#include <lsxintrin.h>\nInstruction: vxor.v vr, vr, vr\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise XOR between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = 
a.dword[i] ^ b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vxor_v (__m128i a, __m128i b)\n#include &lt;lsxintrin.h&gt;\nInstruction: vxor.v vr, vr, vr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise XOR between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[i] ^ b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)\n#include <lsxintrin.h>\nInstruction: vxori.b vr, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nCompute bitwise XOR between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[i] ^ imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m128i __lsx_vxori_b (__m128i a, imm0_255 imm)\n#include &lt;lsxintrin.h&gt;\nInstruction: vxori.b vr, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise XOR between elements in <code>a</code> and 
<code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[i] ^ imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LSX", "display": true}, {"name": "__m256 __lasx_xvfadd_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfadd_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfadd.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd single precision floating point elements in `a` to elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp32[i] + b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfadd_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfadd.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add single precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp32[i] + b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": 
true}, {"name": "__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcvt.s.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double precision floating point elements in `a` and `b` to single precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    dst.fp32[i] = b.fp64[i];\n  } else {\n    dst.fp32[i] = a.fp64[i - 4];\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfcvt_s_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcvt.s.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double precision floating point elements in <code>a</code> and <code>b</code> to single precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    dst.fp32[i] = b.fp64[i];\n  } else {\n    dst.fp32[i] = a.fp64[i - 4];\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfcvth_s_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfcvth_s_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvfcvth.s.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert half precision floating point elements in higher half of `a` to single precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) 
{\n  dst.fp32[i] = a.fp16[8 + i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfcvth_s_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcvth.s.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert half precision floating point elements in higher half of <code>a</code> to single precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp16[8 + i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfcvtl_s_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfcvtl_s_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvfcvtl.s.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert half precision floating point elements in lower half of `a` to single precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp16[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfcvtl_s_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcvtl.s.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert half precision floating point elements in lower half of <code>a</code> to single 
precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp16[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfdiv.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide single precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp32[i] / b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 11, 19.5 | 0.1(1/10.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfdiv_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfdiv.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide single precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp32[i] / b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 19.5</td>\n<td>0.1(1/10.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", 
"extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvffint.s.l xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert 64-bit integer elements in `a` and `b` to single-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] =\n      (i < 4) ? (f32)(s32)a.dword[i]\n              : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvffint_s_l (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffint.s.l xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert 64-bit integer elements in <code>a</code> and <code>b</code> to single-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] =\n      (i &lt; 4) ? 
(f32)(s32)a.dword[i]\n              : (f32)(s32)b.dword[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvffint_s_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvffint_s_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvffint.s.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert signed 32-bit integer elements in `a` to single-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvffint_s_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffint.s.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert signed 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (f32)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, 
{"name": "__m256 __lasx_xvffint_s_wu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvffint_s_wu (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvffint.s.wu xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert unsigned 32-bit integer elements in `a` to single-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvffint_s_wu (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffint.s.wu xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert unsigned 32-bit integer elements in <code>a</code> to single-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (f32)(u32)a.word[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvflogb_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvflogb_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvflogb.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute 2-based logarithm of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = log2(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | 
Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvflogb_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvflogb.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute 2-based logarithm of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = log2(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)\n#include <lasxintrin.h>\nInstruction: xvfmadd.s xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmadd_s (__m256 a, __m256 b, __m256 c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmadd.s xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point 
FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] + c.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmax_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfmax_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfmax.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute maximum of single precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmax_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmax.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = fmax(a.fp32[i], b.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfmaxa.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute maximum of single precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) > abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmaxa_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmaxa.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) &gt; abs(b.fp32[i])) ? 
a.fp32[i] : b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmin_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfmin_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfmin.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute minimum of single precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmin_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmin.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = fmin(a.fp32[i], b.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmina_s (__m256 a, 
__m256 b)\n#include <lasxintrin.h>\nInstruction: xvfmina.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute minimum of single precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) < abs(b.fp32[i])) ? a.fp32[i] : b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmina_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmina.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of single precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (abs(a.fp32[i]) &lt; abs(b.fp32[i])) ? 
a.fp32[i] : b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)\n#include <lasxintrin.h>\nInstruction: xvfmsub.s xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmsub_s (__m256 a, __m256 b, __m256 c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmsub.s xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i] - c.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfmul_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfmul_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfmul.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply single precision floating point elements in `a` and elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfmul_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmul.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply single precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp32[i] * b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)\n#include <lasxintrin.h>\nInstruction: xvfnmadd.s xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed single 
precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfnmadd_s (__m256 a, __m256 b, __m256 c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfnmadd.s xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] + c.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)\n#include <lasxintrin.h>\nInstruction: xvfnmsub.s xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = -(a.fp32[i] * 
b.fp32[i] - c.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfnmsub_s (__m256 a, __m256 b, __m256 c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfnmsub.s xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed single precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = -(a.fp32[i] * b.fp32[i] - c.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrecip_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrecip_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrecip.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute reciprocal of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = 1 / a.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 27 | 0.18(1/5.5) |\n| 3C5000 | 27 | 0.14(1/7) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrecip_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrecip.s xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = 1 / a.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>27</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>27</td>\n<td>0.14(1/7)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrecipe_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrecipe_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrecipe.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute estimated reciprocal of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = 1 / a.fp32[i]; // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrecipe_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrecipe.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = 1 / a.fp32[i]; // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrint_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrint_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrint.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, using current rounding 
mode specified in `fcsr`, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrint_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrint.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, using current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrintrm_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrintrm_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrintrm.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards negative infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 
3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrintrm_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrm.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrintrne_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrintrne_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrintrne.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards nearest even, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrintrne_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrne.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards nearest even, 
and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrintrp_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrintrp_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrintrp.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards positive infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrintrp_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrp.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrintrz_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrintrz_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrintrz.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, rounding towards zero, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrintrz_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrz.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round single-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = (fp32)(s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrsqrt_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrsqrt_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: 
xvfrsqrt.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute reciprocal of square root of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 25 | 0.05(1/19) |\n| 3C5000 | 25 | 0.03(1/32) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrsqrt_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrsqrt.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of square root of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>25</td>\n<td>0.05(1/19)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>25</td>\n<td>0.03(1/32)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfrsqrte_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfrsqrte_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfrsqrte.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute estimated reciprocal of square root of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfrsqrte_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrsqrte.s xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of square root of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = 1.0 / sqrt(a.fp32[i]); // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfsqrt_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfsqrt_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfsqrt.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute square root of single precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = sqrt(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 15 | 0.08(1/12) |\n| 3C5000 | 15 | 0.07(1/13.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfsqrt_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfsqrt.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute square root of single precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = sqrt(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>15</td>\n<td>0.08(1/12)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>15</td>\n<td>0.07(1/13.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvfsub_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvfsub_s (__m256 a, __m256 
b)\n#include <lasxintrin.h>\nInstruction: xvfsub.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract single precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.fp32[i] = a.fp32[i] - b.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvfsub_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfsub.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract single precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.fp32[i] = a.fp32[i] - b.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvpickve.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCopy one 32-bit lane from `a` specified by `imm` to the first lane of `dst`, and set the other lanes to zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (i == 0) ? 
a.word[imm] : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256 __lasx_xvpickve_w_f (__m256 a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Copy one 32-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (i == 0) ? a.word[imm] : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfadd_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfadd_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfadd.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd double precision floating point elements in `a` to elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp64[i] + b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfadd_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfadd.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 
double precision floating point elements in <code>a</code> to elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp64[i] + b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfcvth_d_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfcvth_d_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfcvth.d.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single precision floating point elements in higher half of `a` to double precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp32[4 + i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfcvth_d_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcvth.d.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single precision floating point elements in higher half of <code>a</code> to double precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp32[4 + i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": 
"Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfcvtl_d_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfcvtl_d_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfcvtl.d.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single precision floating point elements in lower half of `a` to double precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp32[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfcvtl_d_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcvtl.d.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single precision floating point elements in lower half of <code>a</code> to double precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp32[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfdiv.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide double precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp64[i] / b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8, 21.5 | 0.25(1/4) |\n| 3C5000 | 8, 17 | 0.08(1/12.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfdiv_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfdiv.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide double precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp64[i] / b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8, 21.5</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 17</td>\n<td>0.08(1/12.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvffint_d_l (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvffint_d_l (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvffint.d.l xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert signed 64-bit integer elements in `a` to double-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvffint_d_l (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffint.d.l xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert signed 64-bit integer elements in <code>a</code> to double-precision floating 
point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (f64)(s64)a.dword[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvffint_d_lu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvffint_d_lu (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvffint.d.lu xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert unsigned 64-bit integer elements in `a` to double-precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvffint_d_lu (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffint.d.lu xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert unsigned 64-bit integer elements in <code>a</code> to double-precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (f64)(u64)a.dword[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvffinth_d_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvffinth_d_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvffinth.d.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert 32-bit integer elements in higher part of `a` to double precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvffinth_d_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffinth.d.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert 32-bit integer elements in higher part of <code>a</code> to double precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i + 4]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvffintl_d_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvffintl_d_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvffintl.d.w xr, 
xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert 32-bit integer elements in lower part of `a` to double precision floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvffintl_d_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvffintl.d.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert 32-bit integer elements in lower part of <code>a</code> to double precision floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (f64)(s32)a.word[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvflogb_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvflogb_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvflogb.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute 2-based logarithm of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = log2(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256d __lasx_xvflogb_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvflogb.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute 2-based logarithm of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = log2(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)\n#include <lasxintrin.h>\nInstruction: xvfmadd.d xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmadd_d (__m256d a, __m256d b, __m256d c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmadd.d xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the result in 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] + c.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfmax_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmax_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfmax.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute maximum of double precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmax_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmax.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = fmax(a.fp64[i], b.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": 
"LASX", "display": true}, {"name": "__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfmaxa.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute maximum of double precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) > abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmaxa_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmaxa.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute maximum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) &gt; abs(b.fp64[i])) ? 
a.fp64[i] : b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfmin_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmin_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfmin.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute minimum of double precision floating point elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmin_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmin.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = fmin(a.fp64[i], b.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfmina_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmina_d 
(__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfmina.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute minimum of double precision floating point elements in `a` and `b` by magnitude.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) < abs(b.fp64[i])) ? a.fp64[i] : b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmina_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmina.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute minimum of double precision floating point elements in <code>a</code> and <code>b</code> by magnitude.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (abs(a.fp64[i]) &lt; abs(b.fp64[i])) ? 
a.fp64[i] : b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)\n#include <lasxintrin.h>\nInstruction: xvfmsub.d xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmsub_d (__m256d a, __m256d b, __m256d c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmsub.d xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i] - c.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfmul_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfmul_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfmul.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply double precision floating point elements in `a` and elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfmul_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfmul.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply double precision floating point elements in <code>a</code> and elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp64[i] * b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)\n#include <lasxintrin.h>\nInstruction: xvfnmadd.d xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute 
packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, accumulate to elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfnmadd_d (__m256d a, __m256d b, __m256d c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfnmadd.d xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, accumulate to elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] + c.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)\n#include <lasxintrin.h>\nInstruction: xvfnmsub.d xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in `a` and `b`, subtract elements in `c` and store the negated result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  
dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfnmsub_d (__m256d a, __m256d b, __m256d c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfnmsub.d xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute packed double precision floating point FMA(Fused Multiply-Add): multiply elements in <code>a</code> and <code>b</code>, subtract elements in <code>c</code> and store the negated result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = -(a.fp64[i] * b.fp64[i] - c.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Fused Multiply-Add", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrecip_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrecip_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrecip.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute reciprocal of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = 1 / a.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 23 | 0.25(1/4) |\n| 3C5000 | 23 | 0.08(1/12) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrecip_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrecip.d xr, 
xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = 1 / a.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>23</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>23</td>\n<td>0.08(1/12)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrecipe_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrecipe_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrecipe.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute estimated reciprocal of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = 1 / a.fp64[i]; // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrecipe_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrecipe.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = 1 / a.fp64[i]; // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrint_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrint_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrint.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound single-precision floating point elements in `a` to integers, 
using current rounding mode specified in `fcsr`, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrint_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrint.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, using current rounding mode specified in <code>fcsr</code>, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrintrm_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrintrm_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrintrm.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards negative infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrintrm_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrm.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards negative infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrintrne_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrintrne_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrintrne.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards nearest even, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrintrne_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrne.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point 
elements in <code>a</code> to integers, rounding towards nearest even, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrintrp_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrintrp_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrintrp.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards positive infinity, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrintrp_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrp.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards positive infinity, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrintrz_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrintrz_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrintrz.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRound double-precision floating point elements in `a` to integers, rounding towards zero, and store as floating point numbers.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrintrz_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrintrz.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Round double-precision floating point elements in <code>a</code> to integers, rounding towards zero, and store as floating point numbers.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = (fp64)(s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrsqrt_d (__m256d a)", "markdown": "### 
Synopsis\n\n```c++\n__m256d __lasx_xvfrsqrt_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrsqrt.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute reciprocal of square root of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 15 | 0.04(1/26.5) |\n| 3C5000 | 15 | 0.04(1/27.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrsqrt_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrsqrt.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute reciprocal of square root of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>15</td>\n<td>0.04(1/26.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>15</td>\n<td>0.04(1/27.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfrsqrte_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfrsqrte_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfrsqrte.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute estimated reciprocal of square root of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfrsqrte_d 
(__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrsqrte.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute estimated reciprocal of square root of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = 1.0 / sqrt(a.fp64[i]); // estimated\n}\n</code></pre>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfsqrt_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfsqrt_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfsqrt.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute square root of double precision floating point elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = sqrt(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 36 | 0.06(1/17.5) |\n| 3C5000 | 36 | 0.05(1/18.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfsqrt_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfsqrt.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute square root of double precision floating point elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = sqrt(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>36</td>\n<td>0.06(1/17.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>36</td>\n<td>0.05(1/18.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvfsub_d 
(__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvfsub_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfsub.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract double precision floating point elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.fp64[i] = a.fp64[i] - b.fp64[i];\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 5 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvfsub_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfsub.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract double precision floating point elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.fp64[i] = a.fp64[i] - b.fp64[i];\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Computation", "extension": "LASX", "display": true}, {"name": "__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)", "markdown": "### Synopsis\n\n```c++\n__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)\n#include <lasxintrin.h>\nInstruction: xvpickve.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCopy one 64-bit lane from `a` specified by `imm` to the first lane of `dst`, and set the other lanes to zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i == 0) ? 
a.dword[imm] : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256d __lasx_xvpickve_d_f (__m256d a, imm0_3 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Copy one 64-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_d_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_d_b (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.d.b xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 8-bit lane of `a` to signed 64-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s8)a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_d_b (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.d.b xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 8-bit 
lane of <code>a</code> to signed 64-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s8)a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_d_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_d_h (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.d.h xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 16-bit lane of `a` to signed 64-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_d_h (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.d.h xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 16-bit lane of <code>a</code> to signed 64-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", 
"display": true}, {"name": "__m256i __lasx_vext2xv_d_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_d_w (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.d.w xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 32-bit lane of `a` to signed 64-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_d_w (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.d.w xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 32-bit lane of <code>a</code> to signed 64-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_du_bu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_du_bu (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.du.bu xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 8-bit lane of `a` to unsigned 64-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u8)a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_du_bu (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.du.bu xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 8-bit lane of <code>a</code> to unsigned 64-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u8)a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_du_hu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_du_hu (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.du.hu xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 16-bit lane of `a` to unsigned 64-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_du_hu (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.du.hu xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 16-bit lane of <code>a</code> to unsigned 64-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on 
real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_du_wu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_du_wu (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.du.wu xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 32-bit lane of `a` to unsigned 64-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_du_wu (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.du.wu xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 32-bit lane of <code>a</code> to unsigned 64-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_h_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_h_b (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.h.b xr, xr\nCPU Flags: 
LSX\n```\n\n### Description\n\nExtend signed 8-bit lane of `a` to signed 16-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_h_b (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.h.b xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 8-bit lane of <code>a</code> to signed 16-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_hu_bu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_hu_bu (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.hu.bu xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 8-bit lane of `a` to unsigned 16-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_hu_bu (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: 
vext2xv.hu.bu xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 8-bit lane of <code>a</code> to unsigned 16-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_w_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_w_b (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.w.b xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 8-bit lane of `a` to signed 32-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s8)a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_w_b (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.w.b xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 8-bit lane of <code>a</code> to signed 32-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s8)a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_w_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_w_h (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.w.h xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend signed 16-bit lane of `a` to signed 32-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_w_h (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.w.h xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 16-bit lane of <code>a</code> to signed 32-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_wu_bu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_wu_bu (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.wu.bu xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 8-bit lane of `a` to unsigned 32-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i 
< 8; i++) {\n  dst.word[i] = (u32)(u8)a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_wu_bu (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.wu.bu xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 8-bit lane of <code>a</code> to unsigned 32-bit elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u8)a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_vext2xv_wu_hu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_vext2xv_wu_hu (__m256i a)\n#include <lsxintrin.h>\nInstruction: vext2xv.wu.hu xr, xr\nCPU Flags: LSX\n```\n\n### Description\n\nExtend unsigned 16-bit lane of `a` to unsigned 32-bit elements.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_vext2xv_wu_hu (__m256i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vext2xv.wu.hu xr, xr\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 16-bit lane of <code>a</code> to unsigned 32-bit 
elements.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of signed 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] > (s8)b.byte[i]) ? (a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &gt; (s8)b.byte[i]) ? 
(a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of unsigned 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] > (u8)b.byte[i]) ? (a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &gt; (u8)b.byte[i]) ? 
(a.byte[i] - b.byte[i])\n                                                : (b.byte[i] - a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of signed 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] > (s64)b.dword[i])\n                     ? (a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &gt; (s64)b.dword[i])\n                     ? 
(a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of unsigned 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] > (u64)b.dword[i])\n                     ? (a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &gt; (u64)b.dword[i])\n                     ? 
(a.dword[i] - b.dword[i])\n                     : (b.dword[i] - a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of signed 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] > (s16)b.half[i]) ? (a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &gt; (s16)b.half[i]) ? 
(a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of unsigned 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] > (u16)b.half[i]) ? (a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &gt; (u16)b.half[i]) ? 
(a.half[i] - b.half[i])\n                                                  : (b.half[i] - a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of signed 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] > (s32)b.word[i]) ? (a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &gt; (s32)b.word[i]) ? 
(a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvabsd.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute absolute difference of unsigned 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] > (u32)b.word[i]) ? (a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvabsd_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvabsd.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute absolute difference of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &gt; (u32)b.word[i]) ? 
(a.word[i] - b.word[i])\n                                                  : (b.word[i] - a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadd_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadd_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadd.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] + b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadd_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadd.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] + b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, 
{"name": "__m256i __lasx_xvadd_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadd_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadd.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] + b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadd_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadd.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] + b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadd_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadd_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadd.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] + b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | 
Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadd_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadd.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] + b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadd_q (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadd_q (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadd.q xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 128-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = a.qword[i] + b.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadd_q (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadd.q xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = a.qword[i] + b.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadd_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadd_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadd.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] + b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadd_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadd.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] + b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", 
"display": true}, {"name": "__m256i __lasx_xvadda_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadda_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadda.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd absolute of 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadda_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadda.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = abs((s8)a.byte[i]) + abs((s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadda_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadda_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadda.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd absolute of 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = 
abs((s64)a.dword[i]) + abs((s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadda_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadda.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = abs((s64)a.dword[i]) + abs((s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadda_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadda_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadda.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd absolute of 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadda_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadda.h xr, xr, 
xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = abs((s16)a.half[i]) + abs((s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvadda_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvadda_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvadda.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd absolute of 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvadda_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvadda.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add absolute of 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = abs((s32)a.word[i]) + abs((s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvaddi.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 8-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddi_bu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddi.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 8-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)\n#include 
<lasxintrin.h>\nInstruction: xvaddi.du xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 64-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddi_du (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddi.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 64-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvaddi.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 16-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddi_hu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddi.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 16-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] + imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvaddi.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nAdd 32-bit elements in `a` and `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] + imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddi_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddi.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 32-bit elements in <code>a</code> and <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] + 
imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_d_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)", 
"markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.d.wu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_d_wu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.d.wu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] + (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 
2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_h_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.h.bu.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_h_bu_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.h.bu.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 
8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] + (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] + (s128)(s64)b.dword[2 * 
i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_q_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.q.du.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_q_du_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.q.du.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] + (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_w_h (__m256i a, 
__m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_w_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.w.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwev.w.hu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd even-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 
|\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwev_w_hu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwev.w.hu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add even-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] + (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 
odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_d_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 
1] + (u64)(u32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.d.wu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_d_wu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.d.wu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)", "markdown": 
"### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_h_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.h.bu.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_h_bu_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.h.bu.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_q_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.q.du xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.q.du.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_q_du_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.q.du.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_w_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.w.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvaddwod_w_hu_h (__m256i a, 
__m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvaddwod.w.hu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvaddwod_w_hu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvaddwod.w.hu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvand_v (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvand_v (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvand.v xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise AND between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i 
= 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] & b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvand_v (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvand.v xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise AND between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &amp; b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvandi.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise AND between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] & imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvandi_b (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvandi.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise 
AND between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &amp; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvandn_v (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvandn_v (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvandn.v xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise ANDN between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = b.dword[i] & (~a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvandn_v (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvandn.v xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise ANDN between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = b.dword[i] &amp; (~a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +\n                ((a.byte[i] & b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] &amp; b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", 
"display": true}, {"name": "__m256i __lasx_xvavg_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +\n                ((a.byte[i] & b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] &amp; b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.d 
xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +\n                 ((a.dword[i] & b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +\n                 ((a.dword[i] & b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] &amp; b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +\n                ((a.half[i] & b.half[i]) & 1);\n}\n```\n\nTested on real 
machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] &amp; b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +\n                ((a.half[i] & b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] &amp; b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of signed 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +\n                ((a.word[i] & b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.w xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] &amp; b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavg_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavg_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavg.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards negative infinity) of unsigned 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +\n                ((a.word[i] & b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavg_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavg.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards negative infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] &amp; b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] >> 1) + ((s8)b.byte[i] >> 1) +\n                ((a.byte[i] | b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &gt;&gt; 1) + ((s8)b.byte[i] &gt;&gt; 1) +\n   
             ((a.byte[i] | b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] >> 1) + ((u8)b.byte[i] >> 1) +\n                ((a.byte[i] | b.byte[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &gt;&gt; 1) + ((u8)b.byte[i] &gt;&gt; 1) +\n                ((a.byte[i] | b.byte[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] >> 1) + ((s64)b.dword[i] >> 1) +\n                 ((a.dword[i] | b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &gt;&gt; 1) + ((s64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] | b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", 
"extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] >> 1) + ((u64)b.dword[i] >> 1) +\n                 ((a.dword[i] | b.dword[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &gt;&gt; 1) + ((u64)b.dword[i] &gt;&gt; 1) +\n                 ((a.dword[i] | b.dword[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_h (__m256i a, __m256i 
b)\n#include <lasxintrin.h>\nInstruction: xvavgr.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] >> 1) + ((s16)b.half[i] >> 1) +\n                ((a.half[i] | b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &gt;&gt; 1) + ((s16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] | b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 16-bit elements in `a` and `b`, 
save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] >> 1) + ((u16)b.half[i] >> 1) +\n                ((a.half[i] | b.half[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &gt;&gt; 1) + ((u16)b.half[i] &gt;&gt; 1) +\n                ((a.half[i] | b.half[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of signed 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] >> 1) + ((s32)b.word[i] >> 1) +\n                ((a.word[i] | b.word[i]) & 
1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of signed 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &gt;&gt; 1) + ((s32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] | b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvavgr.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute the average (rounded towards positive infinity) of unsigned 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] >> 1) + ((u32)b.word[i] >> 1) +\n                ((a.word[i] | b.word[i]) & 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvavgr_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvavgr.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute the average (rounded towards positive infinity) of unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &gt;&gt; 1) + ((u32)b.word[i] &gt;&gt; 1) +\n                ((a.word[i] | b.word[i]) &amp; 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitclr.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclr_b(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700 0xabcdeb0212341234 0xaabaaaba9dee9dee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] & (~((u8)1 << (b.byte[i] % 8)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 
|\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclr.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_b(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xf7f7f7f7f7f7f7f7 0x99aabbccd5ecf700 0xabcdeb0212341234 0xaabaaaba9dee9dee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; (b.byte[i] % 8)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitclr.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclr_d(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xfffff7ffffffffff 0x99aabbccddeeff00 
0xabcdef1012341234 0xaabbaabbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] & (~((u64)1 << (b.dword[i] % 64)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclr.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_d(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xfffff7ffffffffff 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; (b.dword[i] % 64)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitclr.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by elements in `b` 
from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclr_h(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xf7fff7fff7fff7ff 0x99aabbccddecff00 0xabcdef0212341234 0xaabbaabbdceedcee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] & (~((u16)1 << (b.half[i] % 16)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclr.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_h(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xf7fff7fff7fff7ff 0x99aabbccddecff00 0xabcdef0212341234 0xaabbaabbdceedcee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; (b.half[i] % 16)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", 
"extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitclr.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by elements in `b` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclr_w(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xfffff7fffffff7ff 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbdceeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] & (~((u32)1 << (b.word[i] % 32)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclr.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclr_w(__m256i{0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xfffff7fffffff7ff 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbdceeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; (b.word[i] % 
32)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvbitclri.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by `imm` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclri_b( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00 0xa9cded1010341034 0xa8b9a8b9ddecddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] & (~((u8)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclri.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_b( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfdfdfdfdfdfdfdfd 0x99a8b9ccddecfd00 0xa9cded1010341034 0xa8b9a8b9ddecddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &amp; (~((u8)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvbitclri.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by `imm` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclri_d( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfffffffffffffffd 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbddeeddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] & (~((u64)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclri.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_d( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 
0xaabbaabbddeeddee}, 1)\n= 0xfffffffffffffffd 0x99aabbccddeeff00 0xabcdef1212341234 0xaabbaabbddeeddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &amp; (~((u64)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvbitclri.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by `imm` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclri_h( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfffdfffdfffdfffd 0x99a8bbccddecff00 0xabcdef1012341234 0xaab9aab9ddecddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] & (~((u16)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclri.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_h( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfffdfffdfffdfffd 0x99a8bbccddecff00 0xabcdef1012341234 0xaab9aab9ddecddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] &amp; (~((u16)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvbitclri.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClear the bit specified by `imm` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitclri_w( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfffffffdfffffffd 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaab9ddeeddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] & (~((u32)1 << imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitclri.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clear the bit 
specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitclri_w( __m256i{ 0xffffffffffffffff, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0xfffffffdfffffffd 0x99aabbccddeeff00 0xabcdef1012341234 0xaabbaab9ddeeddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] &amp; (~((u32)1 &lt;&lt; imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitrev.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrev_b(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0707070707070707 0x9dbabfdcd5ecf702 0xafddeb021a361a36 0xeabaeaba9def9def\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] ^ ((u8)1 << (b.byte[i] % 8));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrev.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_b(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0707070707070707 0x9dbabfdcd5ecf702 0xafddeb021a361a36 0xeabaeaba9def9def\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; (b.byte[i] % 8));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitrev.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrev_d(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00 0xabcdef1012341234 0xabbbaabbddeeddee\n```\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 << (b.dword[i] % 64));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrev.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_d(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0f0f070f0f0f0f0f 0x99aabbceddeeff00 0xabcdef1012341234 0xabbbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; (b.dword[i] % 64));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitrev.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 16-bit elements in `a`, save the result in 
`dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrev_h(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x070f070f070f070f 0x99babbdcddecff02 0xabddef0212361236 0xabbbabbbdceedcee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 << (b.half[i] % 16));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrev.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_h(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x070f070f070f070f 0x99babbdcddecff02 0xabddef0212361236 0xabbbabbbdceedcee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; (b.half[i] % 16));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i 
__lasx_xvbitrev_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitrev.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by elements in `b` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrev_w(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0f0f070f0f0f070f 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbdceeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 << (b.word[i] % 32));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrev.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrev_w(__m256i{0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0f0f070f0f0f070f 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbdceeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; (b.word[i] % 32));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvbitrevi.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrevi_b( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02 0xa9cfed1010361036 0xa8b9a8b9dfecdfec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] ^ ((u8)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrevi.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_b( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0d0d0d0d0d0d0d0d 0x9ba8b9cedfecfd02 0xa9cfed1010361036 0xa8b9a8b9dfecdfec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  
dst.byte[i] = a.byte[i] ^ ((u8)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvbitrevi.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrevi_d( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrevi.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_d( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0f0f0f0f0f0f0f0d 0x99aabbccddeeff02 0xabcdef1212341236 
0xaabbaabbddeeddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] ^ ((u64)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvbitrevi.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrevi_h( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02 0xabcfef1012361236 0xaab9aab9ddecddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrevi.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_h( __m256i{ 0x0f0f0f0f0f0f0f0f, 
0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0f0d0f0d0f0d0f0d 0x99a8bbceddecff02 0xabcfef1012361236 0xaab9aab9ddecddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] ^ ((u16)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvbitrevi.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nToggle the bit specified by `imm` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitrevi_w( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02 0xabcdef1012341236 0xaabbaab9ddeeddec\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitrevi.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Toggle the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in 
<code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitrevi_w( __m256i{ 0x0f0f0f0f0f0f0f0f, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0f0f0f0d0f0f0f0d 0x99aabbceddeeff02 0xabcdef1012341236 0xaabbaab9ddeeddec\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] ^ ((u32)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvbitsel.v xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise selection: for each bit position, if the bit in `c` equals one, copy the bit from `b` to `dst`, otherwise copy from `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitsel_v(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0xffff0000aaaabbbb, 0x1111222233334444, 0x00000000ffffffff, 0xffffffff00000000})\n= 0xabab3344ffeeefab 0x98ba9beccfedfb00 0xabcdef1243214321 0x56785678ddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (c.dword[i] & b.dword[i]) | (~c.dword[i] & a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 
3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitsel_v (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitsel.v xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise selection: for each bit position, if the bit in <code>c</code> equals one, copy the bit from <code>b</code> to <code>dst</code>, otherwise copy from <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitsel_v(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0xffff0000aaaabbbb, 0x1111222233334444, 0x00000000ffffffff, 0xffffffff00000000})\n= 0xabab3344ffeeefab 0x98ba9beccfedfb00 0xabcdef1243214321 0x56785678ddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (c.dword[i] &amp; b.dword[i]) | (~c.dword[i] &amp; a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvbitseli.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise selection: for each bit position, if the bit in `a` equals one, copy the bit from `imm` to `dst`, otherwise copy from `b`.\n\n\n\n\n### Examples\n\n```c++\n__m256i 
__lasx_xvbitseli_b( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0xba8b9aabba8b9a23 0x1216123012031221 0x1230123653115311 0x5652565212121212\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (~a.byte[i] & b.byte[i]) | (a.byte[i] & (u8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseli_b (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitseli.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise selection: for each bit position, if the bit in <code>a</code> equals to one, copy the bit from <code>imm</code> to <code>dst</code>, otherwise copy from <code>b</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseli_b( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0xba8b9aabba8b9a23 0x1216123012031221 0x1230123653115311 0x5652565212121212\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (~a.byte[i] &amp; b.byte[i]) | (a.byte[i] &amp; (u8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", 
"extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitset_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitset_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitset.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitset_b(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0808080808080808 0x9dbabfdcddeeff02 0xafddef121a361a36 0xeabbeabbddefddef\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 << (b.byte[i] % 8));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitset.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_b(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0808080808080808 0x9dbabfdcddeeff02 0xafddef121a361a36 0xeabbeabbddefddef\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; (b.byte[i] % 8));\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitset_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitset_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitset.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitset_d(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0000080000000000 0x99aabbceddeeff00 0xabcdef1212341234 0xabbbaabbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 << (b.dword[i] % 64));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitset.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_d(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 
0x5678567856785678})\n= 0x0000080000000000 0x99aabbceddeeff00 0xabcdef1212341234 0xabbbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; (b.dword[i] % 64));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitset_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitset_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitset.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitset_h(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0800080008000800 0x99babbdcddeeff02 0xabddef1212361236 0xabbbabbbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 << (b.half[i] % 16));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitset.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 16-bit elements 
in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_h(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0800080008000800 0x99babbdcddeeff02 0xabddef1212361236 0xabbbabbbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; (b.half[i] % 16));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitset_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitset_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvbitset.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by elements in `b` from 32-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitset_w(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0000080000000800 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 << (b.word[i] % 32));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitset.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by elements in <code>b</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitset_w(__m256i{0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabababababababab, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x0000080000000800 0x99babbccddeeff02 0xabddef1212341236 0xabbbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; (b.word[i] % 32));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvbitseti.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by `imm` from 8-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitseti_b( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0202020202020202 0x9baabbcedfeeff02 0xabcfef1212361236 0xaabbaabbdfeedfee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] | 
((u8)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitseti.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 8-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_b( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0202020202020202 0x9baabbcedfeeff02 0xabcfef1212361236 0xaabbaabbdfeedfee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] | ((u8)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvbitseti.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by `imm` from 64-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitseti_d( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0000000000000002 0x99aabbccddeeff02 0xabcdef1212341236 
0xaabbaabbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitseti.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 64-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_d( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0000000000000002 0x99aabbccddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] | ((u64)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvbitseti.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by `imm` from 16-bit elements in `a`, save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitseti_h( __m256i{ 0x0000000000000000, 
0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0002000200020002 0x99aabbceddeeff02 0xabcfef1212361236 0xaabbaabbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitseti.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 16-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_h( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0002000200020002 0x99aabbceddeeff02 0xabcfef1212361236 0xaabbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] | ((u16)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvbitseti.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSet the bit specified by `imm` from 32-bit elements in `a`, 
save the result in `dst`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvbitseti_w( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0000000200000002 0x99aabbceddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 << imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbitseti.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Set the bit specified by <code>imm</code> from 32-bit elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbitseti_w( __m256i{ 0x0000000000000000, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, 1)\n= 0x0000000200000002 0x99aabbceddeeff02 0xabcdef1212341236 0xaabbaabbddeeddee\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] | ((u32)1 &lt;&lt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvbsll.v xr, xr, 
imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute whole vector `a` shifted left by `imm * 8` bits.\n\n\n\n\n\n### Operation\n\n```c++\nint shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] << shift;\ndst.qword[1] = (u128)a.qword[1] << shift;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbsll_v (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbsll.v xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute whole vector <code>a</code> shifted left by <code>imm * 8</code> bits.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] &lt;&lt; shift;\ndst.qword[1] = (u128)a.qword[1] &lt;&lt; shift;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvbsrl.v xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute whole vector `a` shifted right by `imm * 8` bits.\n\n\n\n\n\n### Operation\n\n```c++\nint shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] >> shift;\ndst.qword[1] = (u128)a.qword[1] >> shift;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 
1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvbsrl_v (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvbsrl.v xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute whole vector <code>a</code> shifted right by <code>imm * 8</code> bits.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int shift = (imm * 8) % 128;\ndst.qword[0] = (u128)a.qword[0] &gt;&gt; shift;\ndst.qword[1] = (u128)a.qword[1] &gt;&gt; shift;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclo_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclo_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclo.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading ones of 8-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclo_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000001 0x0101010202030800 0x0102030000000000 0x0101010102030203\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = clo(a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclo.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 8-bit elements in 
<code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000001 0x0101010202030800 0x0102030000000000 0x0101010102030203\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = clo(a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclo_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclo_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclo.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading ones of 64-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclo_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000000 0x0000000000000001 0x0000000000000001 0x0000000000000001\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = clo(a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclo.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 64-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvclo_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000000 0x0000000000000001 0x0000000000000001 0x0000000000000001\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = clo(a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclo_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclo_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclo.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading ones of 16-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclo_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000000 0x0001000100020008 0x0001000300000000 0x0001000100020002\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = clo(a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclo.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 16-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 
0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000000 0x0001000100020008 0x0001000300000000 0x0001000100020002\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = clo(a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclo_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclo_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclo.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading ones of 32-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclo_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000000 0x0000000100000002 0x0000000100000000 0x0000000100000002\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = clo(a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclo.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading ones of 32-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclo_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000000 0x0000000100000002 0x0000000100000000 
0x0000000100000002\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = clo(a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclz_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclz_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclz.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading zeros of 8-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclz_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0302020101010100 0x0000000000000008 0x0000000303020302 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = clz(a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclz.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 8-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0302020101010100 0x0000000000000008 0x0000000303020302 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = clz(a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclz_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclz_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclz.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading zeros of 64-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclz_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000003 0x0000000000000000 0x0000000000000000 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = clz(a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclz.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 64-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000000000003 0x0000000000000000 0x0000000000000000 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = 
clz(a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclz_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclz_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclz.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading zeros of 16-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclz_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0003000200010001 0x0000000000000000 0x0000000000030003 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = clz(a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclz.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 16-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0003000200010001 0x0000000000000000 0x0000000000030003 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = clz(a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvclz_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvclz_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvclz.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount leading zeros of 32-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvclz_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000300000001 0x0000000000000000 0x0000000000000003 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = clz(a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvclz.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count leading zeros of 32-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvclz_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000300000001 0x0000000000000000 0x0000000000000003 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = clz(a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide signed 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] / (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 32 | 0.06(1/15.5) |\n| 3C5000 | 32, 36 | 0.05(1/20.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((s8)a.byte[i] / (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 32</td>\n<td>0.06(1/15.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>32, 36</td>\n<td>0.05(1/20.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide unsigned 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] / (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 33 | 0.06(1/16.5) |\n| 3C5000 | 29, 36 | 0.05(1/20.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((u8)a.byte[i] / (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 33</td>\n<td>0.06(1/16.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 36</td>\n<td>0.05(1/20.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide signed 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] / (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8 | 0.25(1/4) |\n| 3C5000 | 8, 18.5 | 0.11(1/9) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((s64)a.dword[i] / (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 18.5</td>\n<td>0.11(1/9)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide unsigned 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] / (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8 | 0.25(1/4) |\n| 3C5000 | 8, 18.5 | 0.11(1/9) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((u64)a.dword[i] / (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 18.5</td>\n<td>0.11(1/9)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide signed 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] / (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17 | 0.12(1/8.5) |\n| 3C5000 | 21.5, 22 | 0.08(1/13) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((s16)a.half[i] / (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17</td>\n<td>0.12(1/8.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>21.5, 22</td>\n<td>0.08(1/13)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide unsigned 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] / (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17, 22 | 0.11(1/9) |\n| 3C5000 | 17, 21.5 | 0.07(1/15) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((u16)a.half[i] / (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17, 22</td>\n<td>0.11(1/9)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 21.5</td>\n<td>0.07(1/15)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide signed 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] / (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 11, 17.5 | 0.09(1/11.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide signed 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((s32)a.word[i] / (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 17.5</td>\n<td>0.09(1/11.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvdiv.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nDivide unsigned 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] / (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11 | 0.18(1/5.5) |\n| 3C5000 | 11, 17.5 | 0.07(1/15) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvdiv_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvdiv.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Divide unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((u32)a.word[i] / (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 17.5</td>\n<td>0.07(1/15)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_d_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_d_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.d.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend signed 32-bit elements in the higher half of `a` to 64-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 + i];\n}\nfor (; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[4 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_d_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.d.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 32-bit elements in the higher half of <code>a</code> to 64-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 + i];\n}\nfor (; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[4 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_du_wu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_du_wu (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.du.wu xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend unsigned 32-bit elements in the higher half of `a` to 64-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 + i];\n}\nfor (; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[4 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_du_wu (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.du.wu xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 32-bit elements in the higher half of <code>a</code> to 64-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 + i];\n}\nfor (; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[4 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_h_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_h_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: 
xvexth.h.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend signed 8-bit elements in the higher half of `a` to 16-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[8 + i];\n}\nfor (; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[16 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_h_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.h.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 8-bit elements in the higher half of <code>a</code> to 16-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[8 + i];\n}\nfor (; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[16 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_hu_bu (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_hu_bu (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.hu.bu xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend unsigned 8-bit elements in the higher half of `a` to 16-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[8 + i];\n}\nfor (; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[16 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_hu_bu (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.hu.bu xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 8-bit elements in the higher half of <code>a</code> to 16-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[8 + i];\n}\nfor (; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[16 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_q_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_q_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.q.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend signed 64-bit elements in the higher half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[1 + i];\n}\nfor (; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_q_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.q.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 64-bit elements in the higher half 
of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[1 + i];\n}\nfor (; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_qu_du (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_qu_du (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.qu.du xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend unsigned 64-bit elements in the higher half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[1 + i];\n}\nfor (; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_qu_du (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.qu.du xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 64-bit elements in the higher half of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[1 + i];\n}\nfor (; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_w_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvexth_w_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.w.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend signed 16-bit elements in the higher half of `a` to 32-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[4 + i];\n}\nfor (; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[8 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_w_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.w.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 16-bit elements in the higher half of <code>a</code> to 32-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[4 + i];\n}\nfor (; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[8 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvexth_wu_hu (__m256i a)", "markdown": "### 
Synopsis\n\n```c++\n__m256i __lasx_xvexth_wu_hu (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvexth.wu.hu xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend unsigned 16-bit elements in the higher half of `a` to 32-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[4 + i];\n}\nfor (; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[8 + i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvexth_wu_hu (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvexth.wu.hu xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 16-bit elements in the higher half of <code>a</code> to 32-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[4 + i];\n}\nfor (; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[8 + i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvextl_q_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvextl_q_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvextl.q.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend signed 64-bit elements in the lower half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[i];\n}\nfor (; i < 2; i++) {\n  dst.qword[i] = 
(s128)(s64)a.dword[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvextl_q_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvextl.q.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend signed 64-bit elements in the lower half of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[i];\n}\nfor (; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvextl_qu_du (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvextl_qu_du (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvextl.qu.du xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nExtend unsigned 64-bit elements in the lower half of `a` to 128-bit.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[i];\n}\nfor (; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvextl_qu_du (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: 
xvextl.qu.du xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend unsigned 64-bit elements in the lower half of <code>a</code> to 128-bit.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 1; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[i];\n}\nfor (; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvextrins.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtract one 8-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 16; i++) {\n  dst.byte[i] = (i == ((imm >> 4) & 15)) ? b.byte[imm & 15] : a.byte[i];\n}\nfor (; i < 32; i++) {\n  dst.byte[i] =\n      (i - 16 == ((imm >> 4) & 15)) ? 
b.byte[(imm & 15) + 16] : a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvextrins_b (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvextrins.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 8-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i == ((imm &gt;&gt; 4) &amp; 15)) ? b.byte[imm &amp; 15] : a.byte[i];\n}\nfor (; i &lt; 32; i++) {\n  dst.byte[i] =\n      (i - 16 == ((imm &gt;&gt; 4) &amp; 15)) ? b.byte[(imm &amp; 15) + 16] : a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvextrins.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtract one 64-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 2; i++) {\n  dst.dword[i] = (i == ((imm >> 4) & 1)) ? b.dword[imm & 1] : a.dword[i];\n}\nfor (; i < 4; i++) {\n  dst.dword[i] =\n      (i - 2 == ((imm >> 4) & 1)) ? 
b.dword[(imm & 1) + 2] : a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvextrins_d (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvextrins.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 64-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i == ((imm &gt;&gt; 4) &amp; 1)) ? b.dword[imm &amp; 1] : a.dword[i];\n}\nfor (; i &lt; 4; i++) {\n  dst.dword[i] =\n      (i - 2 == ((imm &gt;&gt; 4) &amp; 1)) ? b.dword[(imm &amp; 1) + 2] : a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvextrins.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtract one 16-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 8; i++) {\n  dst.half[i] = (i == ((imm >> 4) & 7)) ? b.half[imm & 7] : a.half[i];\n}\nfor (; i < 16; i++) {\n  dst.half[i] = (i - 8 == ((imm >> 4) & 7)) ? 
b.half[(imm & 7) + 8] : a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvextrins_h (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvextrins.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 16-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i == ((imm &gt;&gt; 4) &amp; 7)) ? b.half[imm &amp; 7] : a.half[i];\n}\nfor (; i &lt; 16; i++) {\n  dst.half[i] = (i - 8 == ((imm &gt;&gt; 4) &amp; 7)) ? b.half[(imm &amp; 7) + 8] : a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvextrins.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtract one 32-bit element in `b` and insert it to `a` according to `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 4; i++) {\n  dst.word[i] = (i == ((imm >> 4) & 3)) ? b.word[imm & 3] : a.word[i];\n}\nfor (; i < 8; i++) {\n  dst.word[i] = (i - 4 == ((imm >> 4) & 3)) ? 
b.word[(imm & 3) + 4] : a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvextrins_w (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvextrins.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extract one 32-bit element in <code>b</code> and insert it to <code>a</code> according to <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i == ((imm &gt;&gt; 4) &amp; 3)) ? b.word[imm &amp; 3] : a.word[i];\n}\nfor (; i &lt; 8; i++) {\n  dst.word[i] = (i - 4 == ((imm &gt;&gt; 4) &amp; 3)) ? b.word[(imm &amp; 3) + 4] : a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfclass_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfclass_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvfclass.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nClassify each double precision floating point element in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = fp_classify(a.fp64[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfclass_d (__m256d 
a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfclass.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Classify each double precision floating point element in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = fp_classify(a.fp64[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfclass_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfclass_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvfclass.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nClassify each single precision floating point element in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = fp_classify(a.fp32[i]);\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfclass_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfclass.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Classify each single precision floating point element in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = fp_classify(a.fp32[i]);\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.caf.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_caf_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.caf.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_caf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.caf.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_caf_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.caf.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_caf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.ceq.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_ceq_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.ceq.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_ceq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.ceq.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_ceq_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.ceq.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_ceq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cle.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cle_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cle.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cle.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cle_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cle.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.clt.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_clt_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.clt.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_clt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.clt.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_clt_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.clt.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_clt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cne.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cne_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cne.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cne.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cne_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cne.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cor.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cor_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cor.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cor.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cor_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cor.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cueq.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cueq_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cueq.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cueq.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cueq_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cueq.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cule.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cule_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cule.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cule.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cule_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cule.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cult.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cult_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cult.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cult.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cult_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cult.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cun.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cun_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cun.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cun.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cun_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cun.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cune.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cune_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cune.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_cune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.cune.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Do not trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_cune_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.cune.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Do not trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_cune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.saf.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_saf_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.saf.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_saf(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.saf.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if AF(Always False), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_saf_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.saf.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if AF(Always False), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_saf(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.seq.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_seq_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.seq.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_seq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.seq.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_seq_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.seq.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if EQ(Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_seq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sle.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sle_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sle.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sle(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sle.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sle_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sle.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LE(Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sle(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.slt.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_slt_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.slt.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_slt(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.slt.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if LT(Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_slt_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.slt.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if LT(Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_slt(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sne.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sne_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sne.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sne(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sne.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sne_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sne.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if NE(Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sne(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sor.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sor_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sor.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sor(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sor.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sor_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sor.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if OR(Ordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sor(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sueq.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sueq_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sueq.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sueq(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sueq.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sueq_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sueq.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UEQ(Unordered or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sueq(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sule.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sule_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sule.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sule(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sule.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sule_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sule.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULE(Unordered, Less than or Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sule(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sult.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sult_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sult.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sult(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sult.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sult_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sult.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if ULT(Unordered or Less than), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sult(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sun.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sun_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sun.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sun(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sun.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sun_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sun.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UN(Unordered), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sun(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sune.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare double precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sune_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sune.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare double precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (fp_compare_sune(a.fp64[i], b.fp64[i])) {\n    dst.dword[i] = 0xFFFFFFFFFFFFFFFF;\n  } else {\n    dst.dword[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcmp.sune.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare single precision elements in `a` and `b`, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into `dst`. Trap for QNaN.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcmp_sune_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcmp.sune.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare single precision elements in <code>a</code> and <code>b</code>, save the comparison result (all ones if UNE(Unordered or Not Equal), all zeros otherwise) into <code>dst</code>. 
Trap for QNaN.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (fp_compare_sune(a.fp32[i], b.fp32[i])) {\n    dst.word[i] = 0xFFFFFFFF;\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)\n#include <lasxintrin.h>\nInstruction: xvfcvt.h.s xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single precision floating point elements in `a` and `b` to half precision.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    dst.fp16[i] = b.fp32[i];\n  } else {\n    dst.fp16[i] = a.fp32[i - 8];\n  }\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfcvt_h_s (__m256 a, __m256 b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfcvt.h.s xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single precision floating point elements in <code>a</code> and <code>b</code> to half precision.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    dst.fp16[i] = b.fp32[i];\n  } else {\n    dst.fp16[i] = a.fp32[i - 8];\n  }\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvfrstp.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFind the first negative 8-bit element in `b`, set the index of the element to the lane of `a` specified by `c`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i < 16; i++) {\n  if ((s8)b.byte[i] < 0) {\n    break;\n  }\n}\ndst.byte[c.byte[0] % 16] = i;\nfor (i = 16; i < 32; i++) {\n  if ((s8)b.byte[i] < 0) {\n    break;\n  }\n}\ndst.byte[(c.byte[16] % 16) + 16] = i - 16;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfrstp_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrstp.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 8-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>c</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i &lt; 16; i++) {\n  if ((s8)b.byte[i] &lt; 0) {\n    break;\n  }\n}\ndst.byte[c.byte[0] % 16] = i;\nfor (i = 16; i &lt; 32; i++) {\n  if ((s8)b.byte[i] &lt; 0) {\n    break;\n  }\n}\ndst.byte[(c.byte[16] % 16) + 16] = i - 16;\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvfrstp.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFind the first negative 16-bit element in `b`, set the index of the element to the lane of `a` specified by `c`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i < 8; i++) {\n  if ((s16)b.half[i] < 0) {\n    break;\n  }\n}\ndst.half[c.half[0] % 8] = i;\nfor (i = 8; i < 16; i++) {\n  if ((s16)b.half[i] < 0) {\n    break;\n  }\n}\ndst.half[(c.half[8] % 8) + 8] = i - 8;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfrstp_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrstp.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 16-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>c</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i &lt; 8; i++) {\n  if ((s16)b.half[i] &lt; 0) {\n    break;\n  }\n}\ndst.half[c.half[0] % 8] = i;\nfor (i = 8; i &lt; 16; i++) {\n  if ((s16)b.half[i] &lt; 0) {\n    break;\n  
}\n}\ndst.half[(c.half[8] % 8) + 8] = i - 8;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvfrstpi.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nFind the first negative 8-bit element in `b`, set the index of the element to the lane of `a` specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i < 16; i++) {\n  if ((s8)b.byte[i] < 0) {\n    break;\n  }\n}\ndst.byte[imm % 16] = i;\nfor (i = 16; i < 32; i++) {\n  if ((s8)b.byte[i] < 0) {\n    break;\n  }\n}\ndst.byte[(imm % 16) + 16] = i - 16;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfrstpi_b (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrstpi.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 8-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i];\n}\nint i;\nfor (i = 0; i &lt; 16; i++) {\n  if ((s8)b.byte[i] &lt; 0) {\n    break;\n  }\n}\ndst.byte[imm % 16] = 
i;\nfor (i = 16; i &lt; 32; i++) {\n  if ((s8)b.byte[i] &lt; 0) {\n    break;\n  }\n}\ndst.byte[(imm % 16) + 16] = i - 16;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvfrstpi.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nFind the first negative 16-bit element in `b`, set the index of the element to the lane of `a` specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i < 8; i++) {\n  if ((s16)b.half[i] < 0) {\n    break;\n  }\n}\ndst.half[imm % 8] = i;\nfor (i = 8; i < 16; i++) {\n  if ((s16)b.half[i] < 0) {\n    break;\n  }\n}\ndst.half[(imm % 8) + 8] = i - 8;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvfrstpi_h (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvfrstpi.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Find the first negative 16-bit element in <code>b</code>, set the index of the element to the lane of <code>a</code> specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i];\n}\nint i;\nfor (i = 0; i &lt; 8; i++) 
{\n  if ((s16)b.half[i] &lt; 0) {\n    break;\n  }\n}\ndst.half[imm % 8] = i;\nfor (i = 8; i &lt; 16; i++) {\n  if ((s16)b.half[i] &lt; 0) {\n    break;\n  }\n}\ndst.half[(imm % 8) + 8] = i - 8;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftint_l_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftint_l_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftint.l.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftint_l_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftint.l.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftint_lu_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftint_lu_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftint.lu.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to unsigned 64-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftint_lu_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftint.lu.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftint_w_d 
(__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvftint.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i < 2)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftint_w_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftint.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 2)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftint_w_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftint_w_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftint.w.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftint_w_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftint.w.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftint_wu_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftint_wu_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftint.wu.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to unsigned 32-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftint_wu_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftint.wu.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftinth_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftinth_l_s (__m256 
a)\n#include <lasxintrin.h>\nInstruction: xvftinth.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftinth_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftinth.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintl_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintl_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintl.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, using current rounding mode specified in `fcsr`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // 
rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintl_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintl.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, using current rounding mode specified in <code>fcsr</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrm_l_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrm_l_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftintrm.l.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrm_l_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: 
xvftintrm.l.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvftintrm.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i < 2)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrm_w_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrm.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 2)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrm_w_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrm_w_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrm.w.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrm_w_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrm.w.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrmh_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrmh_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrmh.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrmh_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrmh.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrml_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrml_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrml.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards negative infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrml_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrml.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards negative infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrne_l_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrne_l_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftintrne.l.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrne_l_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrne.l.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)\n#include 
<lasxintrin.h>\nInstruction: xvftintrne.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i < 2)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrne_w_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrne.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 2)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrne_w_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrne_w_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrne.w.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrne_w_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrne.w.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point 
Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrneh_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrneh_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrneh.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrneh_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrneh.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrnel_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrnel_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrnel.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, 
rounding towards nearest even.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrnel_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrnel.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards nearest even.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrp_l_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrp_l_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftintrp.l.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvftintrp_l_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrp.l.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvftintrp.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i < 2)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrp_w_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrp.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 2)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrp_w_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrp_w_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrp.w.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrp_w_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrp.w.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrph_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrph_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrph.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrph_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrph.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrpl_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrpl_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrpl.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards positive infinity.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrpl_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrpl.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards positive infinity.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrz_l_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrz_l_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftintrz.l.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to signed 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrz_l_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrz.l.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to signed 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrz_lu_d (__m256d a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrz_lu_d (__m256d a)\n#include <lasxintrin.h>\nInstruction: xvftintrz.lu.d 
xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` to unsigned 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrz_lu_d (__m256d a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrz.lu.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> to unsigned 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)a.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)\n#include <lasxintrin.h>\nInstruction: xvftintrz.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert double-precision floating point elements in `a` and `b` to 32-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i < 2)\n                     ? 
(s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrz_w_d (__m256d a, __m256d b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrz.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert double-precision floating point elements in <code>a</code> and <code>b</code> to 32-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 2)\n                     ? (s64)a.fp64[i]\n                     : (s64)b.fp64[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrz_w_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrz_w_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrz.w.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to signed 32-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvftintrz_w_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrz.w.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to signed 32-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrz_wu_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrz_wu_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrz.wu.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in `a` to unsigned 32-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 4 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrz_wu_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrz.wu.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in <code>a</code> to unsigned 32-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  
dst.word[i] = (u32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrzh_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrzh_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrzh.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in higher part of `a` to 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrzh_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrzh.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in higher part of <code>a</code> to 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i + 4]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", 
"extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvftintrzl_l_s (__m256 a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvftintrzl_l_s (__m256 a)\n#include <lasxintrin.h>\nInstruction: xvftintrzl.l.s xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nConvert single-precision floating point elements in lower part of `a` to 64-bit integer, rounding towards zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n```\n\n\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 5 | 2 |\n| 3C5000 | 5 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvftintrzl_l_s (__m256 a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvftintrzl.l.s xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Convert single-precision floating point elements in lower part of <code>a</code> to 64-bit integer, rounding towards zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s32)a.fp32[i]; // rounding mode is not expressed in C\n}\n</code></pre>\n\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>5</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>5</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Floating Point Conversion", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 32-bit elements in `a` to even-positioned signed 32-bit elements in `b` to get 64-bit 
result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 32-bit elements in <code>a</code> to even-positioned signed 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] + (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.du.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 32-bit elements in `a` to even-positioned unsigned 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 
| 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_du_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.du.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 32-bit elements in <code>a</code> to even-positioned unsigned 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] + (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 8-bit elements in `a` to even-positioned signed 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add 
odd-positioned signed 8-bit elements in <code>a</code> to even-positioned signed 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] + (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.hu.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 8-bit elements in `a` to even-positioned unsigned 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_hu_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.hu.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 8-bit elements in <code>a</code> to even-positioned unsigned 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] + 
(u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 64-bit elements in `a` to even-positioned signed 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 64-bit elements in <code>a</code> to even-positioned signed 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] + (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.qu.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 64-bit elements in `a` to even-positioned unsigned 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_qu_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.qu.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 64-bit elements in <code>a</code> to even-positioned unsigned 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] + (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_w_h (__m256i 
a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned signed 16-bit elements in `a` to even-positioned signed 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned signed 16-bit elements in <code>a</code> to even-positioned signed 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] + (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhaddw.wu.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nAdd odd-positioned unsigned 16-bit elements in `a` to even-positioned unsigned 16-bit elements in 
`b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhaddw_wu_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhaddw.wu.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Add odd-positioned unsigned 16-bit elements in <code>a</code> to even-positioned unsigned 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] + (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 32-bit elements in `a` by even-positioned signed 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 32-bit elements in <code>a</code> by even-positioned signed 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.du.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 32-bit elements in `a` by even-positioned unsigned 32-bit elements in `b` to get 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_du_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.du.wu xr, xr, 
xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 32-bit elements in <code>a</code> by even-positioned unsigned 32-bit elements in <code>b</code> to get 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 8-bit elements in `a` by even-positioned signed 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 8-bit elements in <code>a</code> by even-positioned signed 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 
0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.hu.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 8-bit elements in `a` by even-positioned unsigned 8-bit elements in `b` to get 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_hu_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.hu.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 8-bit elements in <code>a</code> by even-positioned unsigned 8-bit elements in <code>b</code> to get 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 64-bit elements in `a` by even-positioned signed 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 64-bit elements in <code>a</code> by even-positioned signed 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_qu_du (__m256i a, 
__m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.qu.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 64-bit elements in `a` by even-positioned unsigned 64-bit elements in `b` to get 128-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_qu_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.qu.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 64-bit elements in <code>a</code> by even-positioned unsigned 64-bit elements in <code>b</code> to get 128-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 16-bit elements in `a` by 
even-positioned signed 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 16-bit elements in <code>a</code> by even-positioned signed 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvhsubw.wu.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 16-bit elements in `a` by even-positioned unsigned 16-bit elements in `b` to get 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvhsubw_wu_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvhsubw.wu.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 16-bit elements in <code>a</code> by even-positioned unsigned 16-bit elements in <code>b</code> to get 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvh_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvh_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvh.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 8-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];\n}\nfor (; i < 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i / 2 + 16] : b.byte[i / 2 + 16];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvh_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvh.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 8-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];\n}\nfor (; i &lt; 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 16] : b.byte[i / 2 + 16];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvh_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvh_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvh.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 64-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];\n}\nfor (; i < 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i / 2 + 2] : b.dword[i / 2 + 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvh_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvh.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 64-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];\n}\nfor (; i &lt; 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 2] : b.dword[i / 2 + 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvh_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvh_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvh.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 16-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];\n}\nfor (; i < 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i / 2 + 8] : b.half[i / 2 + 8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvh_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvh.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 16-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];\n}\nfor (; i &lt; 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 8] : b.half[i / 2 + 8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvh_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvh_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvh.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 32-bit elements in higher half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];\n}\nfor (; i < 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i / 2 + 4] : b.word[i / 2 + 4];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvh_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvh.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 32-bit elements in higher half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];\n}\nfor (; i &lt; 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 4] : b.word[i / 2 + 4];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvl_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvl_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvl.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 8-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];\n}\nfor (; i < 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i / 2 + 8] : b.byte[i / 2 + 8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvl_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvl.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 8-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2] : b.byte[i / 2];\n}\nfor (; i &lt; 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i / 2 + 8] : b.byte[i / 2 + 8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvl_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvl_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvl.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 64-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];\n}\nfor (; i < 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i / 2 + 1] : b.dword[i / 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvl_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvl.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 64-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2] : b.dword[i / 2];\n}\nfor (; i &lt; 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i / 2 + 1] : b.dword[i / 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvl_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvl_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvl.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 16-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];\n}\nfor (; i < 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i / 2 + 4] : b.half[i / 2 + 4];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvl_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvl.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 16-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2] : b.half[i / 2];\n}\nfor (; i &lt; 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i / 2 + 4] : b.half[i / 2 + 4];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvilvl_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvilvl_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvilvl.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nInterleave 32-bit elements in lower half of `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nint i;\nfor (i = 0; i < 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];\n}\nfor (; i < 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i / 2 + 2] : b.word[i / 2 + 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvilvl_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvilvl.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Interleave 32-bit elements in lower half of <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int i;\nfor (i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2] : b.word[i / 2];\n}\nfor (; i &lt; 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i / 2 + 2] : b.word[i / 2 + 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)\n#include <lasxintrin.h>\nInstruction: xvinsgr2vr.d xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nInsert 64-bit element into lane indexed `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i == imm) ? 
b : a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvinsgr2vr_d (__m256i a, long int b, imm0_3 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvinsgr2vr.d xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert 64-bit element into lane indexed <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i == imm) ? b : a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvinsgr2vr.w xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nInsert 32-bit element into lane indexed `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (i == imm) ? 
b : a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvinsgr2vr_w (__m256i a, int b, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvinsgr2vr.w xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert 32-bit element into lane indexed <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (i == imm) ? b : a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)\n#include <lasxintrin.h>\nInstruction: xvinsve0.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nInsert the first 64-bit lane of `b` into lane indexed `imm` of `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i == imm) ? 
b.dword[0] : a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvinsve0_d (__m256i a, __m256i b, imm0_3 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvinsve0.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert the first 64-bit lane of <code>b</code> into lane indexed <code>imm</code> of <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i == imm) ? b.dword[0] : a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvinsve0.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nInsert the first 32-bit lane of `b` into lane indexed `imm` of `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (i == imm) ? 
b.word[0] : a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvinsve0_w (__m256i a, __m256i b, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvinsve0.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Insert the first 32-bit lane of <code>b</code> into lane indexed <code>imm</code> of <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (i == imm) ? b.word[0] : a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)\n#include <lasxintrin.h>\nInstruction: xvld xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRead whole vector from memory address `addr + offset`, save the data into `dst`. 
Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.\n\n\n\n\n\n### Operation\n\n```c++\ndst = memory_load(256, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvld (void * addr, imm_n2048_2047 offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvld xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read whole vector from memory address <code>addr + offset</code>, save the data into <code>dst</code>. Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = memory_load(256, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvldi (imm_n1024_1023 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvldi (imm_n1024_1023 imm)\n#include <lasxintrin.h>\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\n\nInitialize `dst` using predefined patterns:\n\n- `imm[12:10]=0b000`: broadcast `imm[7:0]` as 8-bit elements to all lanes\n- `imm[12:10]=0b001`: broadcast sign-extended `imm[9:0]` as 16-bit elements to all lanes\n- `imm[12:10]=0b010`: broadcast sign-extended `imm[9:0]` as 32-bit elements to all lanes\n- `imm[12:10]=0b011`: broadcast sign-extended `imm[9:0]` as 64-bit elements to all lanes\n- `imm[12:8]=0b10000`: broadcast `imm[7:0]` as 32-bit elements to all lanes\n- `imm[12:8]=0b10001`: broadcast `imm[7:0] << 8` as 32-bit elements to all lanes\n- `imm[12:8]=0b10010`: broadcast `imm[7:0] << 16` as 32-bit elements to all lanes\n- `imm[12:8]=0b10011`: broadcast `imm[7:0] << 24` as 32-bit elements to all lanes\n- `imm[12:8]=0b10100`: broadcast `imm[7:0]` as 16-bit elements to all lanes\n- `imm[12:8]=0b10101`: broadcast `imm[7:0] << 8` as 16-bit elements 
to all lanes\n- `imm[12:8]=0b10110`: broadcast `(imm[7:0] << 8) | 0xFF` as 32-bit elements to all lanes\n- `imm[12:8]=0b10111`: broadcast `(imm[7:0] << 16) | 0xFFFF` as 32-bit elements to all lanes\n- `imm[12:8]=0b11000`: broadcast `imm[7:0]` as 8-bit elements to all lanes\n- `imm[12:8]=0b11001`: repeat each bit of `imm[7:0]` eight times, and broadcast the result as 64-bit elements to all lanes\n- `imm[12:8]=0b11010`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 32-bit elements to all lanes\n- `imm[12:8]=0b11011`: broadcast `(imm[7] << 31) | ((1-imm[6]) << 30) | ((imm[6] * 0x1F) << 25) | (imm[5:0] << 19)` as 64-bit elements to all lanes\n- `imm[12:8]=0b11100`: broadcast `(imm[7] << 63) | ((1-imm[6]) << 62) | ((imm[6] * 0xFF) << 54) | (imm[5:0] << 48)` as 64-bit elements to all lanes\n\n\n\n\n\n\n### Operation\n\n```c++\nu64 imm12_10 = (imm >> 10) & 0b111;\nu64 imm12_8 = (imm >> 8) & 0b11111;\nu64 imm9_0 = imm & 0x3FF;\ns64 simm9_0 = ((s64)imm9_0 << 54) >> 54;\nu64 imm7_0 = imm & 0xFF;\nu64 imm7 = (imm >> 7) & 0x1;\nu64 imm6 = (imm >> 6) & 0x1;\nu64 imm5 = (imm >> 5) & 0x1;\nu64 imm5_0 = imm & 0x3F;\nu64 imm4 = (imm >> 4) & 0x1;\nu64 imm3 = (imm >> 3) & 0x1;\nu64 imm2 = (imm >> 2) & 0x1;\nu64 imm1 = (imm >> 1) & 0x1;\nu64 imm0 = imm & 0x1;\n\nu64 broadcast_value;\nu64 broadcast_width;\nif (imm12_10 == 0b000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_10 == 0b001) {\n  broadcast_value = simm9_0;\n  broadcast_width = 16;\n} else if (imm12_10 == 0b010) {\n  broadcast_value = simm9_0;\n  broadcast_width = 32;\n} else if (imm12_10 == 0b011) {\n  broadcast_value = simm9_0;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b10000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10001) {\n  broadcast_value = imm7_0 << 8;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10010) {\n  broadcast_value = imm7_0 << 16;\n  broadcast_width = 32;\n} else if (imm12_8 == 
0b10011) {\n  broadcast_value = imm7_0 << 24;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10100) {\n  broadcast_value = imm7_0;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10101) {\n  broadcast_value = imm7_0 << 8;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10110) {\n  broadcast_value = (imm7_0 << 8) | 0xFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10111) {\n  broadcast_value = (imm7_0 << 16) | 0xFFFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_8 == 0b11001) {\n  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +\n                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +\n                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +\n                    imm7 * 0xFF00000000000000;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11010) {\n  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |\n                    (imm5_0 << 19);\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11011) {\n  broadcast_value = (imm7 << 31) | ((1 - imm6) << 30) | ((imm6 * 0x1F) << 25) |\n                    (imm5_0 << 19);\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11100) {\n  broadcast_value = (imm7 << 63) | ((1 - imm6) << 62) | ((imm6 * 0xFF) << 54) |\n                    (imm5_0 << 48);\n  broadcast_width = 64;\n}\n\nif (broadcast_width == 8) {\n  for (int i = 0; i < 32; i++) {\n    dst.byte[i] = broadcast_value;\n  }\n} else if (broadcast_width == 16) {\n  for (int i = 0; i < 16; i++) {\n    dst.half[i] = broadcast_value;\n  }\n} else if (broadcast_width == 32) {\n  for (int i = 0; i < 8; i++) {\n    dst.word[i] = broadcast_value;\n  }\n} else if (broadcast_width == 64) {\n  for (int i = 0; i < 4; i++) {\n    dst.dword[i] = broadcast_value;\n  }\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvldi (imm_n1024_1023 imm)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Initialize <code>dst</code> using predefined patterns:</p>\n<ul>\n<li><code>imm[12:10]=0b000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>\n<li><code>imm[12:10]=0b001</code>: broadcast sign-extended <code>imm[9:0]</code> as 16-bit elements to all lanes</li>\n<li><code>imm[12:10]=0b010</code>: broadcast sign-extended <code>imm[9:0]</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:10]=0b011</code>: broadcast sign-extended <code>imm[9:0]</code> as 64-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10000</code>: broadcast <code>imm[7:0]</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10001</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10010</code>: broadcast <code>imm[7:0] &lt;&lt; 16</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10011</code>: broadcast <code>imm[7:0] &lt;&lt; 24</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10100</code>: broadcast <code>imm[7:0]</code> as 16-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10101</code>: broadcast <code>imm[7:0] &lt;&lt; 8</code> as 16-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10110</code>: broadcast <code>(imm[7:0] &lt;&lt; 8) | 0xFF</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b10111</code>: broadcast <code>(imm[7:0] &lt;&lt; 16) | 0xFFFF</code> as 32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11000</code>: broadcast <code>imm[7:0]</code> as 8-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11001</code>: repeat each bit of <code>imm[7:0]</code> eight times, and broadcast the result as 64-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11010</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 
32-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11011</code>: broadcast <code>(imm[7] &lt;&lt; 31) | ((1-imm[6]) &lt;&lt; 30) | ((imm[6] * 0x1F) &lt;&lt; 25) | (imm[5:0] &lt;&lt; 19)</code> as 64-bit elements to all lanes</li>\n<li><code>imm[12:8]=0b11100</code>: broadcast <code>(imm[7] &lt;&lt; 63) | ((1-imm[6]) &lt;&lt; 62) | ((imm[6] * 0xFF) &lt;&lt; 54) | (imm[5:0] &lt;&lt; 48)</code> as 64-bit elements to all lanes</li>\n</ul>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 imm12_10 = (imm &gt;&gt; 10) &amp; 0b111;\nu64 imm12_8 = (imm &gt;&gt; 8) &amp; 0b11111;\nu64 imm9_0 = imm &amp; 0x3FF;\ns64 simm9_0 = ((s64)imm9_0 &lt;&lt; 54) &gt;&gt; 54;\nu64 imm7_0 = imm &amp; 0xFF;\nu64 imm7 = (imm &gt;&gt; 7) &amp; 0x1;\nu64 imm6 = (imm &gt;&gt; 6) &amp; 0x1;\nu64 imm5 = (imm &gt;&gt; 5) &amp; 0x1;\nu64 imm5_0 = imm &amp; 0x3F;\nu64 imm4 = (imm &gt;&gt; 4) &amp; 0x1;\nu64 imm3 = (imm &gt;&gt; 3) &amp; 0x1;\nu64 imm2 = (imm &gt;&gt; 2) &amp; 0x1;\nu64 imm1 = (imm &gt;&gt; 1) &amp; 0x1;\nu64 imm0 = imm &amp; 0x1;\n\nu64 broadcast_value;\nu64 broadcast_width;\nif (imm12_10 == 0b000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_10 == 0b001) {\n  broadcast_value = simm9_0;\n  broadcast_width = 16;\n} else if (imm12_10 == 0b010) {\n  broadcast_value = simm9_0;\n  broadcast_width = 32;\n} else if (imm12_10 == 0b011) {\n  broadcast_value = simm9_0;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b10000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10001) {\n  broadcast_value = imm7_0 &lt;&lt; 8;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10010) {\n  broadcast_value = imm7_0 &lt;&lt; 16;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10011) {\n  broadcast_value = imm7_0 &lt;&lt; 24;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10100) {\n  broadcast_value = imm7_0;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10101) {\n  broadcast_value = imm7_0 
&lt;&lt; 8;\n  broadcast_width = 16;\n} else if (imm12_8 == 0b10110) {\n  broadcast_value = (imm7_0 &lt;&lt; 8) | 0xFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b10111) {\n  broadcast_value = (imm7_0 &lt;&lt; 16) | 0xFFFF;\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11000) {\n  broadcast_value = imm7_0;\n  broadcast_width = 8;\n} else if (imm12_8 == 0b11001) {\n  broadcast_value = imm0 * 0xFF + imm1 * 0xFF00 + imm2 * 0xFF0000 +\n                    imm3 * 0xFF000000 + imm4 * 0xFF00000000 +\n                    imm5 * 0xFF0000000000 + imm6 * 0xFF000000000000 +\n                    imm7 * 0xFF00000000000000;\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11010) {\n  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |\n                    (imm5_0 &lt;&lt; 19);\n  broadcast_width = 32;\n} else if (imm12_8 == 0b11011) {\n  broadcast_value = (imm7 &lt;&lt; 31) | ((1 - imm6) &lt;&lt; 30) | ((imm6 * 0x1F) &lt;&lt; 25) |\n                    (imm5_0 &lt;&lt; 19);\n  broadcast_width = 64;\n} else if (imm12_8 == 0b11100) {\n  broadcast_value = (imm7 &lt;&lt; 63) | ((1 - imm6) &lt;&lt; 62) | ((imm6 * 0xFF) &lt;&lt; 54) |\n                    (imm5_0 &lt;&lt; 48);\n  broadcast_width = 64;\n}\n\nif (broadcast_width == 8) {\n  for (int i = 0; i &lt; 32; i++) {\n    dst.byte[i] = broadcast_value;\n  }\n} else if (broadcast_width == 16) {\n  for (int i = 0; i &lt; 16; i++) {\n    dst.half[i] = broadcast_value;\n  }\n} else if (broadcast_width == 32) {\n  for (int i = 0; i &lt; 8; i++) {\n    dst.word[i] = broadcast_value;\n  }\n} else if (broadcast_width == 64) {\n  for (int i = 0; i &lt; 4; i++) {\n    dst.dword[i] = broadcast_value;\n  }\n}\n</code></pre>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)\n#include 
<lasxintrin.h>\nInstruction: xvldrepl.b xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRead 8-bit data from memory address `addr + (offset << 0)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu8 data = memory_load(8, addr + offset);\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvldrepl_b (void * addr, imm_n2048_2047 offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldrepl.b xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 8-bit data from memory address <code>addr + (offset &lt;&lt; 0)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u8 data = memory_load(8, addr + offset);\nfor (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)\n#include <lasxintrin.h>\nInstruction: xvldrepl.d xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRead 64-bit data from memory address `addr + (offset << 3)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu64 data = memory_load(64, addr + (offset << 3));\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvldrepl_d (void * addr, imm_n256_255 offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldrepl.d xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 64-bit data from memory address <code>addr + (offset &lt;&lt; 3)</code>, replicate the data to all vector lanes and save 
into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 data = memory_load(64, addr + (offset &lt;&lt; 3));\nfor (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)\n#include <lasxintrin.h>\nInstruction: xvldrepl.h xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRead 16-bit data from memory address `addr + (offset << 1)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nu16 data = memory_load(16, addr + (offset << 1));\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvldrepl_h (void * addr, imm_n1024_1023 offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldrepl.h xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 16-bit data from memory address <code>addr + (offset &lt;&lt; 1)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u16 data = memory_load(16, addr + (offset &lt;&lt; 1));\nfor (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)\n#include <lasxintrin.h>\nInstruction: xvldrepl.w xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRead 32-bit data from memory address `addr + (offset << 2)`, replicate the data to all vector lanes and save into `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nu32 data = memory_load(32, addr + (offset << 2));\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = data;\n}\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvldrepl_w (void * addr, imm_n512_511 offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldrepl.w xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read 32-bit data from memory address <code>addr + (offset &lt;&lt; 2)</code>, replicate the data to all vector lanes and save into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u32 data = memory_load(32, addr + (offset &lt;&lt; 2));\nfor (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = data;\n}\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvldx (void * addr, long int offset)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvldx (void * addr, long int offset)\n#include <lasxintrin.h>\nInstruction: xvldx xr, r, r\nCPU Flags: LASX\n```\n\n### Description\n\nRead whole vector from memory address `addr + offset`, save the data into `dst`.  Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.\n\n\n\n\n\n### Operation\n\n```c++\ndst = memory_load(256, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvldx (void * addr, long int offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldx xr, r, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Read whole vector from memory address <code>addr + offset</code>, save the data into <code>dst</code>.  
Note that you can use this intrinsic to load floating point vectors, even though the return type represents integer vectors.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = memory_load(256, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmadd.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 8-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmadd_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmadd.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = b.byte[i] * c.byte[i] + a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmadd_d 
(__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmadd.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 64-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmadd_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmadd.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = b.dword[i] * c.dword[i] + a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmadd.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 16-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = b.half[i] * c.half[i] + a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmadd_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmadd.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = b.half[i] * c.half[i] + a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmadd.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 32-bit elements in `b` and `c`, add to elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = b.word[i] * c.word[i] + a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvmadd_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmadd.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, add to elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = b.word[i] * c.word[i] + a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] =\n      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_d_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 32-bit elements in <code>b</code> and 
signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] =\n      (s64)(s32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `b` and unsigned elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_d_wu (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] =\n      
(u64)(u32)b.word[2 * i] * (u64)(u32)c.word[2 * i] + (u64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.d.wu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_d_wu_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.d.wu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] =\n      (u64)(u32)b.word[2 * i] * (s64)(s32)c.word[2 * i] + (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_h_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `b` and unsigned elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_h_bu (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (u16)(u8)c.byte[2 * i] + (u16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", 
"display": true}, {"name": "__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.h.bu.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_h_bu_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.h.bu.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i] * (s16)(s8)c.byte[2 * i] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)\n#include 
<lasxintrin.h>\nInstruction: xvmaddwev.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] =\n      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_q_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] =\n      (s128)(s64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `b` and unsigned elements in `c`, add to 128-bit 
elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_q_du (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (u128)(u64)c.dword[2 * i] + (u128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.q.du.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + 
(s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_q_du_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.q.du.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] =\n      (u128)(u64)b.dword[2 * i] * (s128)(s64)c.dword[2 * i] + (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] =\n      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 
|", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_w_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] =\n      (s32)(s16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `b` and unsigned elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_w_hu (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.w.hu xr, xr, 
xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (u32)(u16)c.half[2 * i] + (u32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwev.w.hu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwev_w_hu_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwev.w.hu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in 
<code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] =\n      (u32)(u16)b.half[2 * i] * (s32)(s16)c.half[2 * i] + (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_d_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n     
            (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `b` and unsigned elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +\n                 (u64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_d_wu (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (u64)(u32)c.word[2 * i + 1] +\n                 (u64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.d.wu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `b` and signed elements in `c`, add to 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_d_wu_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.d.wu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)b.word[2 * i + 1] * (s64)(s32)c.word[2 * i + 1] +\n                 (s64)a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_h_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (s16)(s8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", 
"display": true}, {"name": "__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `b` and unsigned elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_h_bu (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (u16)(u8)c.byte[2 * i + 1] + (u16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)\n#include 
<lasxintrin.h>\nInstruction: xvmaddwod.h.bu.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `b` and signed elements in `c`, add to 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_h_bu_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.h.bu.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (u16)(u8)b.byte[2 * i + 1] * (s16)(s8)c.byte[2 * i + 1] + (s16)a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in 
`a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_q_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `b` and unsigned elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +\n       
          (u128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_q_du (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (u128)(u64)c.dword[2 * i + 1] +\n                 (u128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.q.du.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `b` and signed elements in `c`, add to 128-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 7 | 1.14 |\n| 3C5000 | 7 | 1.14 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_q_du_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.q.du.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 128-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)b.dword[2 * i + 1] * (s128)(s64)c.dword[2 * i + 1] +\n                 (s128)a.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>1.14</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvmaddwod_w_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `b` and unsigned elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +\n                (u32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_w_hu (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.w.hu xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> and unsigned elements in <code>c</code>, add to 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (u32)(u16)c.half[2 * i + 1] +\n                (u32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmaddwod.w.hu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `b` and signed elements in `c`, add to 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaddwod_w_hu_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaddwod.w.hu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>b</code> and signed elements in <code>c</code>, add to 32-bit elements 
in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)b.half[2 * i + 1] * (s32)(s16)c.half[2 * i + 1] +\n                (s32)a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.d xr, xr, xr\nCPU Flags: 
LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = 
max((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i 
__lasx_xvmax_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmax_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmax_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmax.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmax_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmax.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_b (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 8-bit 
elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = max((s8)a.byte[i], (s8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_bu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = max((u8)a.byte[i], (u8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_d (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = max((s64)a.dword[i], (s64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.du xr, xr, 
imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_du (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = max((u64)a.dword[i], (u64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_h (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = max((s16)a.half[i], (s16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_hu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 
16; i++) {\n  dst.half[i] = max((u16)a.half[i], (u16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for signed 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_w (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = max((s32)a.word[i], (s32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, 
{"name": "__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmaxi.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise maximum for unsigned 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmaxi_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmaxi.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise maximum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = max((u32)a.word[i], (u32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on 
real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 8-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 
8-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 64-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.h xr, xr, xr\nCPU Flags: 
LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 16-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = min((s32)a.word[i], 
(s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmin_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmin_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmin.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 32-bit elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmin_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmin.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_b (__m256i a, 
imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_b (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = min((s8)a.byte[i], (s8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 8-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and 
Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_bu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 8-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = min((u8)a.byte[i], (u8)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_d (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 64-bit elements in 
<code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = min((s64)a.dword[i], (s64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.du xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 64-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_du (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 64-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = min((u64)a.dword[i], (u64)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_h (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = min((s16)a.half[i], (s16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.hu xr, xr, 
imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 16-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_hu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 16-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = min((u16)a.half[i], (u16)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for signed 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_w (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for signed 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = min((s32)a.word[i], (s32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvmini.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute elementwise minimum for unsigned 32-bit elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmini_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmini.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute elementwise minimum for unsigned 32-bit elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; 
i++) {\n  dst.word[i] = min((u32)a.word[i], (u32)imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual signed 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((s8)a.byte[i] % (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 41 | 0.06(1/15.5) |\n| 3C5000 | 29, 33 | 0.05(1/21.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((s8)a.byte[i] % (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 41</td>\n<td>0.06(1/15.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 33</td>\n<td>0.05(1/21.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual unsigned 8-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 0 : ((u8)a.byte[i] % (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 29, 37 | 0.06(1/17.5) |\n| 3C5000 | 29, 37 | 0.05(1/22) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (b.byte[i] == 0) ? 
0 : ((u8)a.byte[i] % (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>29, 37</td>\n<td>0.06(1/17.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>29, 37</td>\n<td>0.05(1/22)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual signed 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((s64)a.dword[i] % (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8, 10 | 0.25(1/4) |\n| 3C5000 | 8, 10 | 0.11(1/9.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((s64)a.dword[i] % (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8, 10</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 10</td>\n<td>0.11(1/9.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual unsigned 64-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 0 : ((u64)a.dword[i] % (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 8, 10 | 0.25(1/4) |\n| 3C5000 | 8, 10 | 0.11(1/9.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (b.dword[i] == 0) ? 
0 : ((u64)a.dword[i] % (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>8, 10</td>\n<td>0.25(1/4)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>8, 10</td>\n<td>0.11(1/9.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual signed 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((s16)a.half[i] % (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17, 21 | 0.12(1/8.5) |\n| 3C5000 | 17, 21 | 0.07(1/13.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((s16)a.half[i] % (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17, 21</td>\n<td>0.12(1/8.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 21</td>\n<td>0.07(1/13.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual unsigned 16-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 0 : ((u16)a.half[i] % (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 17, 25 | 0.11(1/9.5) |\n| 3C5000 | 17, 23 | 0.06(1/16) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (b.half[i] == 0) ? 
0 : ((u16)a.half[i] % (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>17, 25</td>\n<td>0.11(1/9.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>17, 23</td>\n<td>0.06(1/16)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual signed 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((s32)a.word[i] % (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11, 13 | 0.18(1/5.5) |\n| 3C5000 | 11, 15 | 0.07(1/13.5) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual signed 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((s32)a.word[i] % (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11, 13</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 15</td>\n<td>0.07(1/13.5)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmod_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmod_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmod.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nModulo residual unsigned 32-bit elements in `a` by elements in `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 0 : ((u32)a.word[i] % (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 11, 13 | 0.18(1/5.5) |\n| 3C5000 | 11, 15 | 0.06(1/16) |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmod_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmod.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Modulo residual unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (b.word[i] == 0) ? 
0 : ((u32)a.word[i] % (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>11, 13</td>\n<td>0.18(1/5.5)</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>11, 15</td>\n<td>0.06(1/16)</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmskgez_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmskgez_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvmskgez.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFor each 8-bit element in `a`, if the element is greater than or equal to zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvmskgez_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x00000000000001fe 0x0000000000000000 0x000000000000ff0f 0x0000000000000000\n__m256i __lasx_xvmskgez_b(__m256i{0x0000191100000000, 0x00a1000011b11c11, 0x1181000008010101, 0x0000000000000000})\n= 0x000000000000bbff 0x0000000000000000 0x000000000000ffbf 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8080808080808080;\nu64 c = m & a.dword[0];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] |= c << 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n\nc = m & a.dword[2];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[2] = c;\nc = m & a.dword[3];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[2] |= c << 8;\ndst.dword[2] = (u16)~dst.dword[2];\ndst.dword[3] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskgez_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmskgez.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 8-bit element in <code>a</code>, if the element is greater than or equal to zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskgez_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x00000000000001fe 0x0000000000000000 0x000000000000ff0f 0x0000000000000000\n__m256i __lasx_xvmskgez_b(__m256i{0x0000191100000000, 0x00a1000011b11c11, 0x1181000008010101, 0x0000000000000000})\n= 0x000000000000bbff 0x0000000000000000 0x000000000000ffbf 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8080808080808080;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] |= c &lt;&lt; 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n\nc = m &amp; a.dword[2];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[2] = c;\nc = m &amp; a.dword[3];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[2] |= c &lt;&lt; 8;\ndst.dword[2] = (u16)~dst.dword[2];\ndst.dword[3] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i 
__lasx_xvmskltz_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmskltz_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvmskltz.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFor each 8-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvmskltz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x000000000000fe01 0x0000000000000000 0x00000000000000f0 0x0000000000000000\n__m256i __lasx_xvmskltz_b(__m256i{0x0000118100000000, 0x0081000081111118, 0x1181000001010801, 0x0000000000000000})\n= 0x0000000000004810 0x0000000000000000 0x0000000000000040 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8080808080808080;\nu64 c = m & a.dword[0];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] |= c << 8;\ndst.dword[1] = 0;\n\nc = m & a.dword[2];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[2] = c;\nc = m & a.dword[3];\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[2] |= c << 8;\ndst.dword[3] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmskltz.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 8-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 
0x000000000000fe01 0x0000000000000000 0x00000000000000f0 0x0000000000000000\n__m256i __lasx_xvmskltz_b(__m256i{0x0000118100000000, 0x0081000081111118, 0x1181000001010801, 0x0000000000000000})\n= 0x0000000000004810 0x0000000000000000 0x0000000000000040 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8080808080808080;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] |= c &lt;&lt; 8;\ndst.dword[1] = 0;\n\nc = m &amp; a.dword[2];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[2] = c;\nc = m &amp; a.dword[3];\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[2] |= c &lt;&lt; 8;\ndst.dword[3] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmskltz_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmskltz_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvmskltz.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFor each 64-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvmskltz_d(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x0000000000000002 0x0000000000000000 0x0000000000000001 0x0000000000000000\n__m256i __lasx_xvmskltz_d(__m256i{0x0000111800000000, 0x0081000081111111, 0x8111000008010101, 
0x0000000000000000})\n= 0x0000000000000000 0x0000000000000000 0x0000000000000001 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8000000000000000;\nu64 c = m & a.dword[0];\nc >>= 63;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc >>= 63;\ndst.dword[0] |= c << 1;\ndst.dword[1] = 0;\n\nc = m & a.dword[2];\nc >>= 63;\ndst.dword[2] = c;\nc = m & a.dword[3];\nc >>= 63;\ndst.dword[2] |= c << 1;\ndst.dword[3] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmskltz.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 64-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_d(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x0000000000000002 0x0000000000000000 0x0000000000000001 0x0000000000000000\n__m256i __lasx_xvmskltz_d(__m256i{0x0000111800000000, 0x0081000081111111, 0x8111000008010101, 0x0000000000000000})\n= 0x0000000000000000 0x0000000000000000 0x0000000000000001 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8000000000000000;\nu64 c = m &amp; a.dword[0];\nc &gt;&gt;= 63;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc &gt;&gt;= 63;\ndst.dword[0] |= c &lt;&lt; 1;\ndst.dword[1] = 0;\n\nc = m &amp; a.dword[2];\nc &gt;&gt;= 63;\ndst.dword[2] = c;\nc = m &amp; a.dword[3];\nc &gt;&gt;= 63;\ndst.dword[2] |= c &lt;&lt; 1;\ndst.dword[3] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmskltz_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmskltz_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvmskltz.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFor each 16-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvmskltz_h(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x00000000000000f0 0x0000000000000000 0x000000000000000c 0x0000000000000000\n__m256i __lasx_xvmskltz_h(__m256i{0x0000818100000000, 0x0018000018181881, 0x1181000008080808, 0x0000000000000000})\n= 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8000800080008000;\nu64 c = m & a.dword[0];\nc |= c << 15;\nc |= c << 30;\nc >>= 60;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 15;\nc |= c << 30;\nc >>= 60;\ndst.dword[0] |= c << 4;\ndst.dword[1] = 0;\n\nc = m & a.dword[2];\nc |= c << 15;\nc |= c << 30;\nc >>= 60;\ndst.dword[2] = c;\nc = m & a.dword[3];\nc |= c << 15;\nc |= c << 30;\nc >>= 60;\ndst.dword[2] |= c << 4;\ndst.dword[3] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmskltz.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 16-bit element in <code>a</code>, if the element 
is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_h(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x00000000000000f0 0x0000000000000000 0x000000000000000c 0x0000000000000000\n__m256i __lasx_xvmskltz_h(__m256i{0x0000818100000000, 0x0018000018181881, 0x1181000008080808, 0x0000000000000000})\n= 0x0000000000000004 0x0000000000000000 0x0000000000000000 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8000800080008000;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 15;\nc |= c &lt;&lt; 30;\nc &gt;&gt;= 60;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 15;\nc |= c &lt;&lt; 30;\nc &gt;&gt;= 60;\ndst.dword[0] |= c &lt;&lt; 4;\ndst.dword[1] = 0;\n\nc = m &amp; a.dword[2];\nc |= c &lt;&lt; 15;\nc |= c &lt;&lt; 30;\nc &gt;&gt;= 60;\ndst.dword[2] = c;\nc = m &amp; a.dword[3];\nc |= c &lt;&lt; 15;\nc |= c &lt;&lt; 30;\nc &gt;&gt;= 60;\ndst.dword[2] |= c &lt;&lt; 4;\ndst.dword[3] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmskltz_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmskltz_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvmskltz.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFor each 32-bit element in `a`, if the element is less than zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvmskltz_w(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 
0x1234567812345678})\n= 0x000000000000000c 0x0000000000000000 0x0000000000000002 0x0000000000000000\n__m256i __lasx_xvmskltz_w(__m256i{0x0000811100000000, 0x0018000081111111, 0x8111000001010108, 0x0000000000000000})\n= 0x0000000000000004 0x0000000000000000 0x0000000000000002 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x8000000080000000;\nu64 c = m & a.dword[0];\nc |= c << 31;\nc >>= 62;\ndst.dword[0] = c;\nc = m & a.dword[1];\nc |= c << 31;\nc >>= 62;\ndst.dword[0] |= c << 2;\ndst.dword[1] = 0;\n\nc = m & a.dword[2];\nc |= c << 31;\nc >>= 62;\ndst.dword[2] = c;\nc = m & a.dword[3];\nc |= c << 31;\nc >>= 62;\ndst.dword[2] |= c << 2;\ndst.dword[3] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmskltz.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 32-bit element in <code>a</code>, if the element is less than zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmskltz_w(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x000000000000000c 0x0000000000000000 0x0000000000000002 0x0000000000000000\n__m256i __lasx_xvmskltz_w(__m256i{0x0000811100000000, 0x0018000081111111, 0x8111000001010108, 0x0000000000000000})\n= 0x0000000000000004 0x0000000000000000 0x0000000000000002 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x8000000080000000;\nu64 c = m &amp; a.dword[0];\nc |= c &lt;&lt; 31;\nc &gt;&gt;= 62;\ndst.dword[0] = c;\nc = m &amp; a.dword[1];\nc |= c &lt;&lt; 31;\nc &gt;&gt;= 62;\ndst.dword[0] |= c 
&lt;&lt; 2;\ndst.dword[1] = 0;\n\nc = m &amp; a.dword[2];\nc |= c &lt;&lt; 31;\nc &gt;&gt;= 62;\ndst.dword[2] = c;\nc = m &amp; a.dword[3];\nc |= c &lt;&lt; 31;\nc &gt;&gt;= 62;\ndst.dword[2] |= c &lt;&lt; 2;\ndst.dword[3] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmsknz_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmsknz_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvmsknz.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nFor each 8-bit element in `a`, if the element is non-zero, set one bit in `dst`, otherwise clear it.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvmsknz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x000000000000feff 0x0000000000000000 0x000000000000ffff 0x0000000000000000\n__m256i __lasx_xvmsknz_b(__m256i{0x0000111100000000, 0x0011000011111111, 0x1111000001010101, 0x0000000000000000})\n= 0x0000000000004f30 0x0000000000000000 0x00000000000000cf 0x0000000000000000\n```\n\n\n### Operation\n\n```c++\nu64 m = 0x7F7F7F7F7F7F7F7F;\nu64 c = ~(((a.dword[0] & m) + m) | a.dword[0] | m);\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] = c;\nc = ~(((a.dword[1] & m) + m) | a.dword[1] | m);\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[0] |= c << 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n\nc = ~(((a.dword[2] & m) + m) | a.dword[2] | m);\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[2] = c;\nc = ~(((a.dword[3] & m) + m) | a.dword[3] | m);\nc |= c << 7;\nc |= c << 14;\nc |= c << 28;\nc >>= 56;\ndst.dword[2] |= c << 8;\ndst.dword[2] = 
(u16)~dst.dword[2];\ndst.dword[3] = 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmsknz_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmsknz.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>For each 8-bit element in <code>a</code>, if the element is non-zero, set one bit in <code>dst</code>, otherwise clear it.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmsknz_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabababab12121212, 0x1234567812345678})\n= 0x000000000000feff 0x0000000000000000 0x000000000000ffff 0x0000000000000000\n__m256i __lasx_xvmsknz_b(__m256i{0x0000111100000000, 0x0011000011111111, 0x1111000001010101, 0x0000000000000000})\n= 0x0000000000004f30 0x0000000000000000 0x00000000000000cf 0x0000000000000000\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">u64 m = 0x7F7F7F7F7F7F7F7F;\nu64 c = ~(((a.dword[0] &amp; m) + m) | a.dword[0] | m);\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] = c;\nc = ~(((a.dword[1] &amp; m) + m) | a.dword[1] | m);\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[0] |= c &lt;&lt; 8;\ndst.dword[0] = (u16)~dst.dword[0];\ndst.dword[1] = 0;\n\nc = ~(((a.dword[2] &amp; m) + m) | a.dword[2] | m);\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[2] = c;\nc = ~(((a.dword[3] &amp; m) + m) | a.dword[3] | m);\nc |= c &lt;&lt; 7;\nc |= c &lt;&lt; 14;\nc |= c &lt;&lt; 28;\nc &gt;&gt;= 56;\ndst.dword[2] |= c &lt;&lt; 8;\ndst.dword[2] = (u16)~dst.dword[2];\ndst.dword[3] = 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmsub.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 8-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmsub_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmsub.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 8-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = -b.byte[i] * c.byte[i] + a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmsub_d (__m256i 
a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmsub.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 64-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmsub_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmsub.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 64-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = -b.dword[i] * c.dword[i] + a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmsub.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 16-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmsub_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmsub.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 16-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = -b.half[i] * c.half[i] + a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvmsub.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 32-bit elements in `b` and `c`, negate and add elements in `a`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmsub_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmsub.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 32-bit elements in <code>b</code> and <code>c</code>, negate and add elements in <code>a</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = -b.word[i] * c.word[i] + a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply signed 8-bit elements in `a` and `b`, save the high 8-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) >> 8;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (((s16)(s8)a.byte[i] * (s16)(s8)b.byte[i])) &gt;&gt; 8;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply unsigned 8-bit elements in `a` and `b`, save the high 8-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) >> 8;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 8-bit elements in <code>a</code> and <code>b</code>, save the high 8-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (((u16)(u8)a.byte[i] * (u16)(u8)b.byte[i])) &gt;&gt; 8;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply signed 64-bit elements in `a` and `b`, save the high 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) >> 64;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 64-bit elements in <code>a</code> and <code>b</code>, save the high 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (((s128)(s64)a.dword[i] * (s128)(s64)b.dword[i])) &gt;&gt; 64;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_du (__m256i a, __m256i 
b)\n#include <lasxintrin.h>\nInstruction: xvmuh.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply unsigned 64-bit elements in `a` and `b`, save the high 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) >> 64;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 64-bit elements in <code>a</code> and <code>b</code>, save the high 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (((u128)(u64)a.dword[i] * (u128)(u64)b.dword[i])) &gt;&gt; 64;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply signed 16-bit elements in `a` and `b`, save the high 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) >> 16;\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (((s32)(s16)a.half[i] * (s32)(s16)b.half[i])) &gt;&gt; 16;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply unsigned 16-bit elements in `a` and `b`, save the high 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) >> 16;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.hu xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 16-bit elements in <code>a</code> and <code>b</code>, save the high 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (((u32)(u16)a.half[i] * (u32)(u16)b.half[i])) &gt;&gt; 16;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply signed 32-bit elements in `a` and `b`, save the high 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) >> 32;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply signed 32-bit elements in <code>a</code> and <code>b</code>, save the high 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (((s64)(s32)a.word[i] * (s64)(s32)b.word[i])) &gt;&gt; 
32;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmuh.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply unsigned 32-bit elements in `a` and `b`, save the high 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) >> 32;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmuh_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmuh.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply unsigned 32-bit elements in <code>a</code> and <code>b</code>, save the high 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (((u64)(u32)a.word[i] * (u64)(u32)b.word[i])) &gt;&gt; 32;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": 
"LASX", "display": true}, {"name": "__m256i __lasx_xvmul_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmul_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmul.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] * b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmul_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmul.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] * b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmul_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmul_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmul.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] * b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmul_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmul.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] * b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmul_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmul_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmul.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] * b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmul_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmul.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 16-bit elements in <code>a</code> and <code>b</code>, save the result in 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] * b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmul_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmul_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmul.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] * b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmul_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmul.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] * b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)", 
"markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_d_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.d.wu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 
64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_d_wu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.d.wu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] * (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_h_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.h.bu xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.h.bu.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_h_bu_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.h.bu.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i 
&lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] * (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_q_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_q_du_d (__m256i 
a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.q.du.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_q_du_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.q.du.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] * (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned signed 16-bit elements in `a` and signed elements 
in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_w_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.w.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwev.w.hu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply even-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwev_w_hu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwev.w.hu.h xr, xr, 
xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply even-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] * (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 
0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_d_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (u64)(u32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.d.wu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_d_wu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.d.wu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] * (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_h_b 
(__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `a` and unsigned elements in 
`b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_h_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (u16)(u8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.h.bu.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_h_bu_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.h.bu.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] * (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.q.d xr, xr, 
xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_q_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (u128)(u64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.q.du.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 7 | 2 |\n| 3C5000 | 7 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_q_du_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.q.du.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] * (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>7</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": 
"LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nMultiply odd-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_w_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.w.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (u32)(u16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvmulwod.w.hu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### 
Description\n\nMultiply odd-positioned unsigned 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvmulwod_w_hu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvmulwod.w.hu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Multiply odd-positioned unsigned 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] * (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvneg_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvneg_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvneg.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nNegate 8-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = -a.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 
4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvneg_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvneg.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 8-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = -a.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvneg_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvneg_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvneg.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nNegate 64-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = -a.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvneg_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvneg.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 64-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = -a.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvneg_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvneg_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvneg.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nNegate 16-bit elements in `a` and save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = -a.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvneg_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvneg.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 16-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = -a.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvneg_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvneg_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvneg.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nNegate 32-bit elements in `a` and 
save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = -a.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvneg_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvneg.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Negate 32-bit elements in <code>a</code> and save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = -a.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvnor_v (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvnor_v (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvnor.v xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise NOR between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ~(a.dword[i] | b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvnor_v (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvnor.v xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise NOR between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ~(a.dword[i] | b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvnori.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise NOR between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ~(a.byte[i] | imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvnori_b (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvnori.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise NOR between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ~(a.byte[i] | imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvor_v (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvor_v (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvor.v xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise OR between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] | b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvor_v (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvor.v xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise OR between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] | b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvori.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise OR between elements in `a` and `imm`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] | imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvori_b (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvori.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise OR between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] | imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvorn_v (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvorn_v (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvorn.v xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise ORN between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] | (~b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvorn_v (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvorn.v xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute 
bitwise ORN between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] | (~b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackev_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackev_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackev.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack even-positioned 8-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i - 1] : b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackev_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackev.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 8-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i - 1] : b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackev_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackev_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackev.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack even-positioned 64-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i - 1] : b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackev_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackev.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 64-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i - 1] : b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackev_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackev_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackev.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack even-positioned 16-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i - 1] : b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackev_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackev.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 16-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i - 1] : b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackev_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackev_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackev.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack even-positioned 32-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i - 1] : b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackev_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackev.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack even-positioned 32-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i - 1] : b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackod_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackod_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackod.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack odd-positioned 8-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? a.byte[i] : b.byte[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackod_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackod.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 8-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (i % 2 == 1) ? 
a.byte[i] : b.byte[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackod_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackod_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackod.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack odd-positioned 64-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? a.dword[i] : b.dword[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackod_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackod.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 64-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i % 2 == 1) ? 
a.dword[i] : b.dword[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackod_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackod_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackod.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack odd-positioned 16-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? a.half[i] : b.half[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackod_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackod.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 16-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (i % 2 == 1) ? 
a.half[i] : b.half[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpackod_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpackod_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpackod.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCollect and pack odd-positioned 32-bit elements in `a` and `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? a.word[i] : b.word[i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpackod_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpackod.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Collect and pack odd-positioned 32-bit elements in <code>a</code> and <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (i % 2 == 1) ? 
a.word[i] : b.word[i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpcnt_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpcnt_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvpcnt.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 8-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvpcnt_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0202040204040602 0x0404060406060800 0x0505070202030203 0x0406040606060606\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = popcount(a.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpcnt.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 8-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_b(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0202040204040602 0x0404060406060800 0x0505070202030203 0x0406040606060606\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = 
popcount(a.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpcnt_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpcnt_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvpcnt.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 64-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvpcnt_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x000000000000001a 0x0000000000000026 0x000000000000001d 0x000000000000002c\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = popcount(a.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpcnt.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 64-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_d(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x000000000000001a 0x0000000000000026 0x000000000000001d 0x000000000000002c\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = 
popcount(a.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpcnt_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpcnt_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvpcnt.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 16-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvpcnt_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0004000600080008 0x0008000a000c0008 0x000a000900050005 0x000a000a000c000c\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = popcount(a.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpcnt.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 16-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_h(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0004000600080008 0x0008000a000c0008 0x000a000900050005 0x000a000a000c000c\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = 
popcount(a.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpcnt_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpcnt_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvpcnt.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCount the number of ones (population, popcount) in 32-bit elements in `a`.\n\n\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvpcnt_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000a00000010 0x0000001200000014 0x000000130000000a 0x0000001400000018\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = popcount(a.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpcnt.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Count the number of ones (population, popcount) in 32-bit elements in <code>a</code>.</p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpcnt_w(__m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee})\n= 0x0000000a00000010 0x0000001200000014 0x000000130000000a 0x0000001400000018\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = 
popcount(a.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Bitwise Operations", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvperm_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvperm_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvperm.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPermute words from `a` with indices recorded in `b` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[b.word[i] % 0x8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvperm_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvperm.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Permute words from <code>a</code> with indices recorded in <code>b</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[b.word[i] % 0x8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Permutation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)", 
"markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvpermi.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nPermute double words from `a` with indices recorded in `imm` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst.dword[0] = a.dword[imm & 0x3];\ndst.dword[1] = a.dword[(imm >> 2) & 0x3];\ndst.dword[2] = a.dword[(imm >> 4) & 0x3];\ndst.dword[3] = a.dword[(imm >> 6) & 0x3];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpermi_d (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpermi.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Permute double words from <code>a</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.dword[0] = a.dword[imm &amp; 0x3];\ndst.dword[1] = a.dword[(imm &gt;&gt; 2) &amp; 0x3];\ndst.dword[2] = a.dword[(imm &gt;&gt; 4) &amp; 0x3];\ndst.dword[3] = a.dword[(imm &gt;&gt; 6) &amp; 0x3];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Permutation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvpermi.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### 
Description\n\nPermute quad words from `a` and `b` with indices recorded in `imm` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nif ((imm & 0x4) && MACHINE_3C5000) {\n  // Caveat: observed in 3C5000\n  dst.qword[0] = 0;\n} else {\n  dst.qword[0] = (imm & 2) ? a.qword[imm & 0x1] : b.qword[imm & 0x1];\n}\nif ((imm & 0x80) && MACHINE_3C5000) {\n  // Caveat: observed in 3C5000\n  dst.qword[1] = 0;\n} else {\n  dst.qword[1] =\n      (imm & 0x20) ? a.qword[(imm >> 4) & 0x1] : b.qword[(imm >> 4) & 0x1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpermi_q (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpermi.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Permute quad words from <code>a</code> and <code>b</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">if ((imm &amp; 0x4) &amp;&amp; MACHINE_3C5000) {\n  // Caveat: observed in 3C5000\n  dst.qword[0] = 0;\n} else {\n  dst.qword[0] = (imm &amp; 2) ? a.qword[imm &amp; 0x1] : b.qword[imm &amp; 0x1];\n}\nif ((imm &amp; 0x80) &amp;&amp; MACHINE_3C5000) {\n  // Caveat: observed in 3C5000\n  dst.qword[1] = 0;\n} else {\n  dst.qword[1] =\n      (imm &amp; 0x20) ? 
a.qword[(imm &gt;&gt; 4) &amp; 0x1] : b.qword[(imm &gt;&gt; 4) &amp; 0x1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Permutation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvpermi.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nPermute words from `a` and `b` with indices recorded in `imm` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst.word[0] = b.word[imm & 0x3];\ndst.word[1] = b.word[(imm >> 2) & 0x3];\ndst.word[2] = a.word[(imm >> 4) & 0x3];\ndst.word[3] = a.word[(imm >> 6) & 0x3];\ndst.word[4] = b.word[4 + (imm & 0x3)];\ndst.word[5] = b.word[4 + ((imm >> 2) & 0x3)];\ndst.word[6] = a.word[4 + ((imm >> 4) & 0x3)];\ndst.word[7] = a.word[4 + ((imm >> 6) & 0x3)];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpermi_w (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpermi.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Permute words from <code>a</code> and <code>b</code> with indices recorded in <code>imm</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.word[0] = b.word[imm &amp; 0x3];\ndst.word[1] = b.word[(imm &gt;&gt; 2) &amp; 0x3];\ndst.word[2] = a.word[(imm &gt;&gt; 4) &amp; 
0x3];\ndst.word[3] = a.word[(imm &gt;&gt; 6) &amp; 0x3];\ndst.word[4] = b.word[4 + (imm &amp; 0x3)];\ndst.word[5] = b.word[4 + ((imm &gt;&gt; 2) &amp; 0x3)];\ndst.word[6] = a.word[4 + ((imm &gt;&gt; 4) &amp; 0x3)];\ndst.word[7] = a.word[4 + ((imm &gt;&gt; 6) &amp; 0x3)];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Permutation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickev_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickev_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickev.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick even-positioned 8-bit elements in `b` first, then pick even-positioned 8-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? b.byte[i * 2] : a.byte[(i - 8) * 2];\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickev_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickev.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 8-bit elements in <code>b</code> first, then pick even-positioned 8-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? 
b.byte[i * 2] : a.byte[(i - 8) * 2];\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = (i &lt; 24) ? b.byte[(i - 8) * 2] : a.byte[(i - 16) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickev_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickev_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickev.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick even-positioned 64-bit elements in `b` first, then pick even-positioned 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickev_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickev.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 64-bit elements in <code>b</code> first, then pick even-positioned 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? b.dword[i * 2] : a.dword[(i - 1) * 2];\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 3) ? 
b.dword[(i - 1) * 2] : a.dword[(i - 2) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickev_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickev_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickev.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick even-positioned 16-bit elements in `b` first, then pick even-positioned 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? b.half[i * 2] : a.half[(i - 4) * 2];\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2] : a.half[(i - 8) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickev_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickev.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 16-bit elements in <code>b</code> first, then pick even-positioned 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? b.half[i * 2] : a.half[(i - 4) * 2];\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = (i &lt; 12) ? 
b.half[(i - 4) * 2] : a.half[(i - 8) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickev_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickev_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickev.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick even-positioned 32-bit elements in `b` first, then pick even-positioned 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? b.word[i * 2] : a.word[(i - 2) * 2];\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2] : a.word[(i - 4) * 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickev_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickev.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick even-positioned 32-bit elements in <code>b</code> first, then pick even-positioned 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? b.word[i * 2] : a.word[(i - 2) * 2];\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = (i &lt; 6) ? 
b.word[(i - 2) * 2] : a.word[(i - 4) * 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickod_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickod_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickod.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick odd-positioned 8-bit elements in `b` first, then pick odd-positioned 8-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = (i < 24) ? b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickod_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickod.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 8-bit elements in <code>b</code> first, then pick odd-positioned 8-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? b.byte[i * 2 + 1] : a.byte[(i - 8) * 2 + 1];\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = (i &lt; 24) ? 
b.byte[(i - 8) * 2 + 1] : a.byte[(i - 16) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickod_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickod_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickod.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick odd-positioned 64-bit elements in `b` first, then pick odd-positioned 64-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = (i < 3) ? b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickod_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickod.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 64-bit elements in <code>b</code> first, then pick odd-positioned 64-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? b.dword[i * 2 + 1] : a.dword[(i - 1) * 2 + 1];\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 3) ? 
b.dword[(i - 1) * 2 + 1] : a.dword[(i - 2) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickod_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickod_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickod.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick odd-positioned 16-bit elements in `b` first, then pick odd-positioned 16-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = (i < 12) ? b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickod_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickod.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 16-bit elements in <code>b</code> first, then pick odd-positioned 16-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? b.half[i * 2 + 1] : a.half[(i - 4) * 2 + 1];\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = (i &lt; 12) ? 
b.half[(i - 4) * 2 + 1] : a.half[(i - 8) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickod_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickod_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvpickod.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nPick odd-positioned 32-bit elements in `b` first, then pick odd-positioned 32-bit elements in `a`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = (i < 6) ? b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickod_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickod.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick odd-positioned 32-bit elements in <code>b</code> first, then pick odd-positioned 32-bit elements in <code>a</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? b.word[i * 2 + 1] : a.word[(i - 2) * 2 + 1];\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = (i &lt; 6) ? 
b.word[(i - 2) * 2 + 1] : a.word[(i - 4) * 2 + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)\n#include <lasxintrin.h>\nInstruction: xvpickve.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCopy one 64-bit lane from `a` specified by `imm` to the first lane of `dst`, and set the other lanes to zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (i == 0) ? a.dword[imm] : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickve_d (__m256i a, imm0_3 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Copy one 64-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (i == 0) ? 
a.dword[imm] : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvpickve.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCopy one 32-bit lane from `a` specified by `imm` to the first lane of `dst`, and set the other lanes to zero.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (i == 0) ? a.word[imm] : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvpickve_w (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Copy one 32-bit lane from <code>a</code> specified by <code>imm</code> to the first lane of <code>dst</code>, and set the other lanes to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (i == 0) ? 
a.word[imm] : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)\n#include <lasxintrin.h>\nInstruction: xvrepl128vei.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[idx];\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = a.byte[idx + 16];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepl128vei_b (__m256i a, imm0_15 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrepl128vei.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[idx];\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[idx + 16];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", 
"extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)\n#include <lasxintrin.h>\nInstruction: xvrepl128vei.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[idx];\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = a.dword[idx + 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepl128vei_d (__m256i a, imm0_1 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrepl128vei.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[idx];\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[idx + 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)\n#include <lasxintrin.h>\nInstruction: xvrepl128vei.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole 
vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[idx];\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = a.half[idx + 8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepl128vei_h (__m256i a, imm0_7 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrepl128vei.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[idx];\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = a.half[idx + 8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)\n#include <lasxintrin.h>\nInstruction: xvrepl128vei.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[idx];\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = a.word[idx + 4];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", 
"content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepl128vei_w (__m256i a, imm0_3 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrepl128vei.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[idx];\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = a.word[idx + 4];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplgr2vr_b (int val)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplgr2vr_b (int val)\n#include <lasxintrin.h>\nInstruction: xvreplgr2vr.b xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplgr2vr_b (int val)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplgr2vr.b xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplgr2vr_d (long int val)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplgr2vr_d (long int val)\n#include <lasxintrin.h>\nInstruction: xvreplgr2vr.d xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplgr2vr_d (long int val)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplgr2vr.d xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplgr2vr_h (int val)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplgr2vr_h (int val)\n#include <lasxintrin.h>\nInstruction: xvreplgr2vr.h xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 
16; i++) {\n  dst.half[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplgr2vr_h (int val)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplgr2vr.h xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplgr2vr_w (int val)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplgr2vr_w (int val)\n#include <lasxintrin.h>\nInstruction: xvreplgr2vr.w xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `val` to whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = val;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 1 |\n| 3C5000 | N/A | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplgr2vr_w (int val)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplgr2vr.w xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>val</code> to whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = 
val;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepli_b (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepli_b (imm_n512_511 imm)\n#include <lasxintrin.h>\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepli_b (imm_n512_511 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepli_d (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepli_d (imm_n512_511 imm)\n#include <lasxintrin.h>\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepli_d (imm_n512_511 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to 
fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepli_h (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepli_h (imm_n512_511 imm)\n#include <lasxintrin.h>\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepli_h (imm_n512_511 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrepli_w (imm_n512_511 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrepli_w (imm_n512_511 imm)\n#include <lasxintrin.h>\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat `imm` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = imm;\n}\n```\n\nTested on real machine.", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrepli_w (imm_n512_511 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvldi xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat <code>imm</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; 
i++) {\n  dst.word[i] = imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve0_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve0_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvreplve0.b xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the first 8-bit lane from `a` to all lanes of `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[0];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve0_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve0.b xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the first 8-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[0];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve0_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve0_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvreplve0.d xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the first 64-bit lane from `a` to all lanes of `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[0];\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve0_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve0.d xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the first 64-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[0];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve0_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve0_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvreplve0.h xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the first 16-bit lane from `a` to all lanes of `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[0];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve0_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve0.h xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the first 16-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i 
&lt; 16; i++) {\n  dst.half[i] = a.half[0];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve0_q (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve0_q (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvreplve0.q xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the first 128-bit lane from `a` to all lanes of `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = a.qword[0];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve0_q (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve0.q xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the first 128-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = a.qword[0];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve0_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve0_w (__m256i a)\n#include 
<lasxintrin.h>\nInstruction: xvreplve0.w xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the first 32-bit lane from `a` to all lanes of `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[0];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 4 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve0_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve0.w xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the first 32-bit lane from <code>a</code> to all lanes of <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[0];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve_b (__m256i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve_b (__m256i a, int idx)\n#include <lasxintrin.h>\nInstruction: xvreplve.b xr, xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = a.byte[idx % 16];\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = a.byte[(idx % 16) + 16];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve_b (__m256i a, int idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve.b xr, xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = a.byte[idx % 16];\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[(idx % 16) + 16];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve_d (__m256i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve_d (__m256i a, int idx)\n#include <lasxintrin.h>\nInstruction: xvreplve.d xr, xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = a.dword[idx % 2];\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = a.dword[(idx % 2) + 2];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve_d (__m256i a, int idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve.d xr, xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = a.dword[idx % 2];\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[(idx % 2) + 2];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve_h (__m256i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve_h (__m256i a, int idx)\n#include <lasxintrin.h>\nInstruction: xvreplve.h xr, xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = a.half[idx % 8];\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = a.half[(idx % 8) + 8];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve_h (__m256i a, int idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve.h xr, xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = a.half[idx % 8];\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = a.half[(idx % 8) + 8];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvreplve_w (__m256i a, int idx)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvreplve_w (__m256i a, int idx)\n#include <lasxintrin.h>\nInstruction: xvreplve.w xr, xr, r\nCPU Flags: LASX\n```\n\n### Description\n\nRepeat the element in lane `idx` of `a` to fill whole vector.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = a.word[idx % 4];\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = a.word[(idx % 4) + 4];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvreplve_w (__m256i a, int idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvreplve.w xr, xr, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Repeat the element in lane <code>idx</code> of <code>a</code> to fill whole vector.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = a.word[idx % 4];\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = a.word[(idx % 4) + 4];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotr_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotr_b (__m256i a, __m256i b)\n#include 
<lasxintrin.h>\nInstruction: xvrotr.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] =\n      (a.byte[i] >> (b.byte[i] & 0x7)) | (a.byte[i] << (8 - (b.byte[i] & 0x7)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotr_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotr.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] =\n      (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) | (a.byte[i] &lt;&lt; (8 - (b.byte[i] &amp; 0x7)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotr_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotr_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvrotr.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (a.dword[i] >> (b.dword[i] & 
0x3f)) |\n                 (a.dword[i] << (64 - (b.dword[i] & 0x3f)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotr_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotr.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) |\n                 (a.dword[i] &lt;&lt; (64 - (b.dword[i] &amp; 0x3f)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotr_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotr_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvrotr.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) |\n                (a.half[i] << (16 - (b.half[i] & 0xf)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotr_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotr.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) |\n                (a.half[i] &lt;&lt; (16 - (b.half[i] &amp; 0xf)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotr_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotr_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvrotr.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) |\n                (a.word[i] << (32 - (b.word[i] & 0x1f)));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotr_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotr.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 32-bit 
elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) |\n                (a.word[i] &lt;&lt; (32 - (b.word[i] &amp; 0x1f)));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvrotri.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (a.byte[i] >> imm) | (a.byte[i] << (8 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotri_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotri.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (a.byte[i] &gt;&gt; imm) | (a.byte[i] &lt;&lt; (8 - imm));\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvrotri.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (a.dword[i] >> imm) | (a.dword[i] << (64 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotri_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotri.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (a.dword[i] &gt;&gt; imm) | (a.dword[i] &lt;&lt; (64 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotri_h 
(__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvrotri.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (a.half[i] >> imm) | (a.half[i] << (16 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotri_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotri.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (a.half[i] &gt;&gt; imm) | (a.half[i] &lt;&lt; (16 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvrotri.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nRotate right the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] 
= (a.word[i] >> imm) | (a.word[i] << (32 - imm));\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvrotri_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvrotri.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Rotate right the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (a.word[i] &gt;&gt; imm) | (a.word[i] &lt;&lt; (32 - imm));\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturing add the signed 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_b (__m256i a, __m256i b)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvsadd.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturing add the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (s8)sadd((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturing add the unsigned 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (u8)sadd((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsadd.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturing add the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (u8)sadd((u8)a.byte[i], 
(u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturing add the signed 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsadd.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturing add the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)sadd((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": 
true}, {"name": "__m256i __lasx_xvsadd_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturing add the unsigned 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsadd.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturing add the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)sadd((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturing add the signed 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) 
{\n  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsadd.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)sadd((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating add the unsigned 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_hu (__m256i a, __m256i b)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvsadd.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)sadd((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating add the signed 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)sadd((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsadd.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)sadd((s32)a.word[i], 
(s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsadd.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating add the unsigned 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsadd_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsadd.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating add the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)sadd((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": 
true}, {"name": "__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp signed 8-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = clamp<s8>(a.byte[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 8-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = clamp&lt;s8&gt;(a.byte[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp unsigned 8-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = clamp<u8>(a.byte[i], 0, (1 << (imm + 
1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_bu (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 8-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = clamp&lt;u8&gt;(a.byte[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp signed 64-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = clamp<s64>(a.dword[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.d xr, xr, imm\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 64-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = clamp&lt;s64&gt;(a.dword[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.du xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp unsigned 64-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = clamp<u64>(a.dword[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_du (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 64-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = clamp&lt;u64&gt;(a.dword[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp signed 16-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = clamp<s16>(a.half[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 16-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = clamp&lt;s16&gt;(a.half[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_hu 
(__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp unsigned 16-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = clamp<u16>(a.half[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_hu (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 16-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = clamp&lt;u16&gt;(a.half[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp signed 32-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = clamp<s32>(a.word[i], -(1 << imm), (1 << imm) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp signed 32-bit elements in <code>a</code> to range specified by <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = clamp&lt;s32&gt;(a.word[i], -(1 &lt;&lt; imm), (1 &lt;&lt; imm) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsat.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nClamp unsigned 32-bit elements in `a` to range specified by `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = clamp<u32>(a.word[i], 0, (1 << (imm + 1)) - 1);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsat_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsat.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Clamp unsigned 32-bit elements in <code>a</code> to range specified by 
<code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = clamp&lt;u32&gt;(a.word[i], 0, (1 &lt;&lt; (imm + 1)) - 1);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseq_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseq_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvseq.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 8-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseq_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseq.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (a.byte[i] == b.byte[i]) ? 
0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseq_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseq_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvseq.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 64-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseq_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseq.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (a.dword[i] == b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseq_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseq_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvseq.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 16-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (a.half[i] == b.half[i]) ? 0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseq_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseq.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (a.half[i] == b.half[i]) ? 
0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseq_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseq_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvseq.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 32-bit elements in `a` and `b`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (a.word[i] == b.word[i]) ? 0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseq_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseq.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (a.word[i] == b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvseqi.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 8-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] == imm) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseqi_b (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseqi.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] == imm) ? 
0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvseqi.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 64-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] == imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseqi_d (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseqi.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] == imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvseqi.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 16-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] == imm) ? 0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseqi_h (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseqi.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] == imm) ? 
0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvseqi.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the 32-bit elements in `a` and `imm`, store all-ones to `dst` if equal, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] == imm) ? 0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvseqi_w (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseqi.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if equal, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] == imm) ? 
0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvshuf4i.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle every four 8-bit elements in `a` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/xvshuf4i_b.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf4i_b( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0x13ef13cd78667815 0x3412343421432121 0x3412343421432121 0x7856787878567878\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_b (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf4i.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 8-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf4i_b.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_b( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 
0x5678567856785678}, 0x12)\n= 0x13ef13cd78667815 0x3412343421432121 0x3412343421432121 0x7856787878567878\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvshuf4i.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle every four 64-bit elements in `a` and `b` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/xvshuf4i_d.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf4i_d( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0xabcdef1314156678 0x1122334455667788 0x1234123443214321 0xabcdef1212341234\n```\n\n\n### Operation\n\n```c++\ndst.dword[0] = (imm & 2) ? b.dword[(imm & 1)] : a.dword[(imm & 1)];\ndst.dword[1] =\n    (imm & 8) ? b.dword[((imm >> 2) & 1)] : a.dword[((imm >> 2) & 1)];\ndst.dword[2] = (imm & 2) ? b.dword[(imm & 1) + 2] : a.dword[(imm & 1) + 2];\ndst.dword[3] =\n    (imm & 8) ? 
b.dword[((imm >> 2) & 1) + 2] : a.dword[((imm >> 2) & 1) + 2];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_d (__m256i a, __m256i b, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf4i.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 64-bit elements in <code>a</code> and <code>b</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf4i_d.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_d( __m256i{ 0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0xabcdef1314156678 0x1122334455667788 0x1234123443214321 0xabcdef1212341234\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst.dword[0] = (imm &amp; 2) ? b.dword[(imm &amp; 1)] : a.dword[(imm &amp; 1)];\ndst.dword[1] =\n    (imm &amp; 8) ? b.dword[((imm &gt;&gt; 2) &amp; 1)] : a.dword[((imm &gt;&gt; 2) &amp; 1)];\ndst.dword[2] = (imm &amp; 2) ? b.dword[(imm &amp; 1) + 2] : a.dword[(imm &amp; 1) + 2];\ndst.dword[3] =\n    (imm &amp; 8) ? 
b.dword[((imm &gt;&gt; 2) &amp; 1) + 2] : a.dword[((imm &gt;&gt; 2) &amp; 1) + 2];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvshuf4i.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle every four 16-bit elements in `a` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/xvshuf4i_h.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf4i_h( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0x667814156678ef13 0x4321432143211234 0x4321432143211234 0x5678567856785678\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_h (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf4i.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 16-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf4i_h.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_h( __m256i{ 
0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0x667814156678ef13 0x4321432143211234 0x4321432143211234 0x5678567856785678\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)\n#include <lasxintrin.h>\nInstruction: xvshuf4i.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle every four 32-bit elements in `a` with indices packed in `imm`, save the result to `dst`.\n\n![](../diagram/xvshuf4i_w.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf4i_w( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0x1415667843214321 0x14156678abcdef13 0x4321432156785678 0x4321432112341234\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[(i & ~0x3) + ((imm >> (2 * (i & 0x3))) & 0x3)];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_w (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf4i.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle every four 
32-bit elements in <code>a</code> with indices packed in <code>imm</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf4i_w.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf4i_w( __m256i{ 0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, 0x12)\n= 0x1415667843214321 0x14156678abcdef13 0x4321432156785678 0x4321432112341234\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[(i &amp; ~0x3) + ((imm &gt;&gt; (2 * (i &amp; 0x3))) &amp; 0x3)];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvshuf.b xr, xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\n\nShuffle bytes from `a` and `b` with indices from `c`.\n\nCaveat: the indices are placed in `c`, while in other `vshuf` intrinsics, they are placed in `a`.\n\n\n![](../diagram/xvshuf_b.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0x1f1f00001a0a1b0b, 0x1111120213031404, 0x0102030405060708, 0x1112131405060708})\n= 0x99997878ee21dd43 0x7777661555144413 0x4321433412341278 0x1234121212341278\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  if ((c.byte[i] 
% 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.byte[i] = 0;\n  } else if ((c.byte[i] % 32) < 16) {\n    dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i >= 16) ? 16 : 0)];\n  } else {\n    dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i >= 16) ? 0 : -16)];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_b (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf.b xr, xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle bytes from <code>a</code> and <code>b</code> with indices from <code>c</code>.</p>\n<p>Caveat: the indices are placed in <code>c</code>, while in other <code>vshuf</code> intrinsics, they are placed in <code>a</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf_b.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_b(__m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678}, __m256i{0x1f1f00001a0a1b0b, 0x1111120213031404, 0x0102030405060708, 0x1112131405060708})\n= 0x99997878ee21dd43 0x7777661555144413 0x4321433412341278 0x1234121212341278\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  if ((c.byte[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.byte[i] = 0;\n  } else if ((c.byte[i] % 32) &lt; 16) {\n    dst.byte[i] = b.byte[(c.byte[i] % 32) + ((i &gt;= 16) ? 16 : 0)];\n  } else {\n    dst.byte[i] = a.byte[(c.byte[i] % 32) + ((i &gt;= 16) ? 
0 : -16)];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvshuf.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle 64-bit elements in `b` and `c` with indices from `a`, save the result to `dst`.\n\n![](../diagram/xvshuf_d.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf_d(__m256i{0x0000000000000000, 0x0000000000000003, 0x0000000000000002, 0x0000000000000001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xabcdef1314156678 0x99aabbccddeeff00 0xabcdef1212341234 0x5678567856785678\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if ((a.dword[i] % 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.dword[i] = 0;\n  } else if ((a.dword[i] % 4) < 2) {\n    dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i >= 2) ? 2 : 0)];\n  } else {\n    dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i >= 2) ? 
0 : -2)];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_d (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle 64-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf_d.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_d(__m256i{0x0000000000000000, 0x0000000000000003, 0x0000000000000002, 0x0000000000000001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0xabcdef1314156678 0x99aabbccddeeff00 0xabcdef1212341234 0x5678567856785678\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if ((a.dword[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.dword[i] = 0;\n  } else if ((a.dword[i] % 4) &lt; 2) {\n    dst.dword[i] = c.dword[(a.dword[i] % 4) + ((i &gt;= 2) ? 2 : 0)];\n  } else {\n    dst.dword[i] = b.dword[(a.dword[i] % 4) + ((i &gt;= 2) ? 
0 : -2)];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvshuf.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle 16-bit elements in `b` and `c` with indices from `a`, save the result to `dst`.\n\n![](../diagram/xvshuf_h.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf_h(__m256i{0x0001000200030004, 0x0005000a000b000c, 0x000f000e00010002, 0x0008000900020001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x1415ef13abcd4321 0x432133441122ff00 0xaabbaabb43211234 0x1234123412344321\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if ((a.half[i] % 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.half[i] = 0;\n  } else if ((a.half[i] % 16) < 8) {\n    dst.half[i] = c.half[(a.half[i] % 16) + ((i >= 8) ? 8 : 0)];\n  } else {\n    dst.half[i] = b.half[(a.half[i] % 16) + ((i >= 8) ? 
0 : -8)];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_h (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle 16-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf_h.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_h(__m256i{0x0001000200030004, 0x0005000a000b000c, 0x000f000e00010002, 0x0008000900020001}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x1415ef13abcd4321 0x432133441122ff00 0xaabbaabb43211234 0x1234123412344321\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if ((a.half[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.half[i] = 0;\n  } else if ((a.half[i] % 16) &lt; 8) {\n    dst.half[i] = c.half[(a.half[i] % 16) + ((i &gt;= 8) ? 8 : 0)];\n  } else {\n    dst.half[i] = b.half[(a.half[i] % 16) + ((i &gt;= 8) ? 
0 : -8)];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)\n#include <lasxintrin.h>\nInstruction: xvshuf.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nShuffle 32-bit elements in `b` and `c` with indices from `a`, save the result to `dst`.\n\n![](../diagram/xvshuf_w.svg)\n\n\n### Examples\n\n```c++\n__m256i __lasx_xvshuf_w(__m256i{0x0000000200000004, 0x0000000700000005, 0x0000000100000003, 0x0000000400000000}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x4321432155667788 0x99aabbcc11223344 0x1234123456785678 0x1234123443214321\n```\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if ((a.word[i] % 256) >= 64 && MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.word[i] = 0;\n  } else if ((a.word[i] % 8) < 4) {\n    dst.word[i] = c.word[(a.word[i] % 8) + ((i >= 4) ? 4 : 0)];\n  } else {\n    dst.word[i] = b.word[(a.word[i] % 8) + ((i >= 4) ? 
0 : -4)];\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_w (__m256i a, __m256i b, __m256i c)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvshuf.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Shuffle 32-bit elements in <code>b</code> and <code>c</code> with indices from <code>a</code>, save the result to <code>dst</code>.</p>\n<p><img alt=\"\" src=\"../diagram/xvshuf_w.svg\" /></p>\n<h3>Examples</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvshuf_w(__m256i{0x0000000200000004, 0x0000000700000005, 0x0000000100000003, 0x0000000400000000}, __m256i{0x1122334455667788, 0x99aabbccddeeff00, 0xabcdef1212341234, 0xaabbaabbddeeddee}, __m256i{0xabcdef1314156678, 0x1234123443214321, 0x1234123443214321, 0x5678567856785678})\n= 0x4321432155667788 0x99aabbcc11223344 0x1234123456785678 0x1234123443214321\n</code></pre>\n\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if ((a.word[i] % 256) &gt;= 64 &amp;&amp; MACHINE_3C5000) {\n    // Caveat: observed in 3C5000\n    dst.word[i] = 0;\n  } else if ((a.word[i] % 8) &lt; 4) {\n    dst.word[i] = c.word[(a.word[i] % 8) + ((i &gt;= 4) ? 4 : 0)];\n  } else {\n    dst.word[i] = b.word[(a.word[i] % 8) + ((i &gt;= 4) ? 
0 : -4)];\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shuffling", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsigncov.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nIf the 8-bit element in `a` equals zero, set the result to zero. If the signed 8-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] =\n      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] > 0 ? b.byte[i] : -b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsigncov_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsigncov.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 8-bit element in <code>a</code> equals zero, set the result to zero. If the signed 8-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. 
If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] =\n      (a.byte[i] == 0) ? 0 : ((s8)a.byte[i] &gt; 0 ? b.byte[i] : -b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsigncov.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nIf the 64-bit element in `a` equals zero, set the result to zero. If the signed 64-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] =\n      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] > 0 ? b.dword[i] : -b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsigncov_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsigncov.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 64-bit element in <code>a</code> equals zero, set the result to zero. 
If the signed 64-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] =\n      (a.dword[i] == 0) ? 0 : ((s64)a.dword[i] &gt; 0 ? b.dword[i] : -b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsigncov.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nIf the 16-bit element in `a` equals zero, set the result to zero. If the signed 16-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] =\n      (a.half[i] == 0) ? 0 : ((s16)a.half[i] > 0 ? 
b.half[i] : -b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsigncov_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsigncov.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 16-bit element in <code>a</code> equals zero, set the result to zero. If the signed 16-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] =\n      (a.half[i] == 0) ? 0 : ((s16)a.half[i] &gt; 0 ? b.half[i] : -b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsigncov.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nIf the 32-bit element in `a` equals zero, set the result to zero. If the signed 32-bit element in `a` is positive, copy element in `b` to result. Otherwise, copy negated element in `b` to result. 
If `a` and `b` are the same vectors, it is equivalent to computing absolute value.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] =\n      (a.word[i] == 0) ? 0 : ((s32)a.word[i] > 0 ? b.word[i] : -b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 2 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsigncov_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsigncov.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>If the 32-bit element in <code>a</code> equals zero, set the result to zero. If the signed 32-bit element in <code>a</code> is positive, copy element in <code>b</code> to result. Otherwise, copy negated element in <code>b</code> to result. If <code>a</code> and <code>b</code> are the same vectors, it is equivalent to computing absolute value.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] =\n      (a.word[i] == 0) ? 0 : ((s32)a.word[i] &gt; 0 ? 
b.word[i] : -b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] <= (s8)b.byte[i]) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt;= (s8)b.byte[i]) ? 
0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] <= (u8)b.byte[i]) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt;= (u8)b.byte[i]) ? 
0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] <= (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt;= (s64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] <= (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt;= (u64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] <= (s16)b.half[i]) ? 0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt;= (s16)b.half[i]) ? 
0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] <= (u16)b.half[i]) ? 0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt;= (u16)b.half[i]) ? 
0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] <= (s32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt;= (s32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsle_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsle_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsle.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than or equal `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] <= (u32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsle_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsle.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt;= (u32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] <= imm) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_b (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt;= imm) ? 
0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] <= imm) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_bu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt;= imm) ? 
0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_d (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt;= imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.du xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] <= imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_du (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt;= imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] <= imm) ? 0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_h (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt;= imm) ? 
0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] <= imm) ? 0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_hu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt;= imm) ? 
0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_w (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt;= imm) ? 
0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslei.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` with `imm`, store all-ones to `dst` if corresponding element in `a` is less than or equal to `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] <= imm) ? 0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslei_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslei.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> with <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than or equal to <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt;= imm) ? 
0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsll_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsll_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsll.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] << (b.byte[i] & 0x7);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsll_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsll.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &lt;&lt; (b.byte[i] &amp; 0x7);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": 
true}, {"name": "__m256i __lasx_xvsll_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsll_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsll.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] << (b.dword[i] & 0x3f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsll_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsll.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &lt;&lt; (b.dword[i] &amp; 0x3f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsll_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsll_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsll.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 
0; i < 16; i++) {\n  dst.half[i] = a.half[i] << (b.half[i] & 0xf);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsll_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsll.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] &lt;&lt; (b.half[i] &amp; 0xf);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsll_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsll_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsll.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] << (b.word[i] & 0x1f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsll_w (__m256i a, __m256i b)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvsll.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] &lt;&lt; (b.word[i] &amp; 0x1f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvslli.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslli_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslli.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &lt;&lt; 
imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvslli.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslli_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslli.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslli_h (__m256i a, imm0_15 
imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslli.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslli_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslli.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslli.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical left shift the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and 
Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslli_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslli.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical left shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsllwil.d.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtend and shift signed 32-bit elements in `a` by `imm` to signed 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i] << imm;\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i + 2] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsllwil_d_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsllwil.d.w xr, xr, imm\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift signed 32-bit elements in <code>a</code> by <code>imm</code> to signed 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i] &lt;&lt; imm;\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[i + 2] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsllwil.du.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtend and shift unsigned 32-bit elements in `a` by `imm` to unsigned 64-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i] << imm;\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i + 2] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsllwil_du_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsllwil.du.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift unsigned 32-bit elements in <code>a</code> by <code>imm</code> to unsigned 64-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 
0; i &lt; 2; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i] &lt;&lt; imm;\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[i + 2] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsllwil.h.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtend and shift signed 8-bit elements in `a` by `imm` to signed 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i] << imm;\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i + 8] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsllwil_h_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsllwil.h.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift signed 8-bit elements in <code>a</code> by <code>imm</code> to signed 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i] &lt;&lt; imm;\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[i + 8] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsllwil.hu.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtend and shift unsigned 8-bit elements in `a` by `imm` to unsigned 16-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i] << imm;\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i + 8] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsllwil_hu_bu (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsllwil.hu.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift unsigned 8-bit elements in <code>a</code> by <code>imm</code> to unsigned 16-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i] &lt;&lt; imm;\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[i + 8] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", 
"group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsllwil.w.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtend and shift signed 16-bit elements in `a` by `imm` to signed 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[i] << imm;\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[i + 4] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsllwil_w_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsllwil.w.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift signed 16-bit elements in <code>a</code> by <code>imm</code> to signed 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (s32)(s16)a.half[i] &lt;&lt; imm;\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[i + 4] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: 
xvsllwil.wu.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nExtend and shift unsigned 16-bit elements in `a` by `imm` to unsigned 32-bit result.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[i] << imm;\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[i + 4] << imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsllwil_wu_hu (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsllwil.wu.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Extend and shift unsigned 16-bit elements in <code>a</code> by <code>imm</code> to unsigned 32-bit result.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (u32)(u16)a.half[i] &lt;&lt; imm;\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[i + 4] &lt;&lt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; 
i++) {\n  dst.byte[i] = ((s8)a.byte[i] < (s8)b.byte[i]) ? 0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt; (s8)b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] < (u8)b.byte[i]) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt; (u8)b.byte[i]) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] < (s64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt; (s64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] < (u64)b.dword[i]) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt; (u64)b.dword[i]) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] < (s16)b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt; (s16)b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] < (u16)b.half[i]) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt; (u16)b.half[i]) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] < (s32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt; (s32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslt_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslt_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvslt.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `b`, store all-ones to `dst` if corresponding element in `a` is less than `b`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] < (u32)b.word[i]) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslt_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslt.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>b</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt; (u32)b.word[i]) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 8-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] < imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_b (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i] &lt; imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 8-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] < imm) ? 
0xFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_bu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 8-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((u8)a.byte[i] &lt; imm) ? 0xFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 64-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] < imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_d (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.du xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 64-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] < imm) ? 
0xFFFFFFFFFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 4 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_du (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 64-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((u64)a.dword[i] &lt; imm) ? 0xFFFFFFFFFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 16-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i] < imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_h (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i] &lt; imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 16-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((u16)a.half[i] < imm) ? 
0xFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_hu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 16-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((u16)a.half[i] &lt; imm) ? 0xFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the signed 32-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i] < imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_w (__m256i a, imm_n16_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the signed 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvslti.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompare the unsigned 32-bit elements in `a` and `imm`, store all-ones to `dst` if corresponding element in `a` is less than `imm`, zero otherwise.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((u32)a.word[i] < imm) ? 
0xFFFFFFFF : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvslti_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvslti.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compare the unsigned 32-bit elements in <code>a</code> and <code>imm</code>, store all-ones to <code>dst</code> if corresponding element in <code>a</code> is less than <code>imm</code>, zero otherwise.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((u32)a.word[i] &lt; imm) ? 0xFFFFFFFF : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Comparison", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsra_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsra_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsra.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) >> (b.byte[i] & 0x7);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsra_b 
(__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsra.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; (b.byte[i] &amp; 0x7);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsra_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsra_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsra.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) >> (b.dword[i] & 0x3f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsra_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsra.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 
4; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; (b.dword[i] &amp; 0x3f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsra_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsra_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsra.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i]) >> (b.half[i] & 0xf);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsra_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsra.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i]) &gt;&gt; (b.half[i] &amp; 0xf);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsra_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsra_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsra.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i]) >> (b.word[i] & 0x1f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsra_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsra.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i]) &gt;&gt; (b.word[i] &amp; 0x1f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)\n#include 
<lasxintrin.h>\nInstruction: xvsrai.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrai_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrai.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = ((s8)a.byte[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrai.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrai_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrai.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = ((s64)a.dword[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrai.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = ((s16)a.half[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrai_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrai.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by 
<code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = ((s16)a.half[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrai.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = ((s32)a.word[i]) >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrai_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrai.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = ((s32)a.word[i]) &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsran.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? (s8)((s16)a.half[i] >> (b.half[i] & 15)) : 0;\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = (i < 24) ? (s8)((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsran_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsran.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? (s8)((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = (i &lt; 24) ? 
(s8)((s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsran.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? (s16)((s32)a.word[i] >> (b.word[i] & 31)) : 0;\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] =\n      (i < 12) ? (s16)((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsran_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsran.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? (s16)((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] =\n      (i &lt; 12) ? 
(s16)((s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsran.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? (s32)((s64)a.dword[i] >> (b.dword[i] & 63)) : 0;\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] =\n      (i < 6) ? (s32)((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsran_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsran.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? (s32)((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] =\n      (i &lt; 6) ? 
(s32)((s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrani.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] =\n      (i < 8) ? (s8)((s16)b.half[i] >> imm) : (s8)((s16)a.half[i - 8] >> imm);\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = (i < 24) ? (s8)((s16)b.half[i - 8] >> imm)\n                         : (s8)((s16)a.half[i - 16] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrani_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrani.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] =\n      (i &lt; 8) ? 
(s8)((s16)b.half[i] &gt;&gt; imm) : (s8)((s16)a.half[i - 8] &gt;&gt; imm);\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = (i &lt; 24) ? (s8)((s16)b.half[i - 8] &gt;&gt; imm)\n                         : (s8)((s16)a.half[i - 16] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvsrani.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? (s64)((s128)b.qword[i] >> imm)\n                         : (s64)((s128)a.qword[i - 1] >> imm);\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = (i < 3) ? 
(s64)((s128)b.qword[i - 1] >> imm)\n                         : (s64)((s128)a.qword[i - 2] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrani_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrani.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? (s64)((s128)b.qword[i] &gt;&gt; imm)\n                         : (s64)((s128)a.qword[i - 1] &gt;&gt; imm);\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 3) ? 
(s64)((s128)b.qword[i - 1] &gt;&gt; imm)\n                         : (s64)((s128)a.qword[i - 2] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrani.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (i < 4) ? (s16)((s32)b.word[i] >> imm) : (s16)((s32)a.word[i - 4] >> imm);\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = (i < 12) ? 
(s16)((s32)b.word[i - 4] >> imm)\n                         : (s16)((s32)a.word[i - 8] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrani_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrani.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (i &lt; 4) ? (s16)((s32)b.word[i] &gt;&gt; imm) : (s16)((s32)a.word[i - 4] &gt;&gt; imm);\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = (i &lt; 12) ? 
(s16)((s32)b.word[i - 4] &gt;&gt; imm)\n                         : (s16)((s32)a.word[i - 8] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrani.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? (s32)((s64)b.dword[i] >> imm)\n                        : (s32)((s64)a.dword[i - 2] >> imm);\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = (i < 6) ? 
(s32)((s64)b.dword[i - 2] >> imm)\n                        : (s32)((s64)a.dword[i - 4] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrani_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrani.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? (s32)((s64)b.dword[i] &gt;&gt; imm)\n                        : (s32)((s64)a.dword[i - 2] &gt;&gt; imm);\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = (i &lt; 6) ? 
(s32)((s64)b.dword[i - 2] &gt;&gt; imm)\n                        : (s32)((s64)a.dword[i - 4] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrar_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrar_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrar.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  if ((b.byte[i] & 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] >> (b.byte[i] & 0x7)) +\n                  (((s8)a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrar_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrar.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  if ((b.byte[i] &amp; 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +\n           
       (((s8)a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrar_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrar_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrar.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if ((b.dword[i] & 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = ((s64)a.dword[i] >> (b.dword[i] & 0x3f)) +\n                   (((s64)a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrar_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrar.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if ((b.dword[i] &amp; 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +\n                  
 (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrar_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrar_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrar.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if ((b.half[i] & 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = ((s16)a.half[i] >> (b.half[i] & 0xf)) +\n                  (((s16)a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrar_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrar.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if ((b.half[i] &amp; 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +\n                  (((s16)a.half[i] 
&gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrar_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrar_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrar.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if ((b.word[i] & 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = ((s32)a.word[i] >> (b.word[i] & 0x1f)) +\n                  (((s32)a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrar_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrar.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if ((b.word[i] &amp; 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +\n                  (((s32)a.word[i] &gt;&gt; ((b.word[i] 
&amp; 0x1f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsrari.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] >> imm) + (((s8)a.byte[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrari_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrari.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = ((s8)a.byte[i] &gt;&gt; imm) + (((s8)a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrari.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] =\n        ((s64)a.dword[i] >> imm) + (((s64)a.dword[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrari_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrari.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] =\n        ((s64)a.dword[i] &gt;&gt; imm) + (((s64)a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrari.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] =\n        ((s16)a.half[i] >> imm) + (((s16)a.half[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrari_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrari.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] =\n        ((s16)a.half[i] &gt;&gt; imm) + (((s16)a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrari.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] =\n        ((s32)a.word[i] >> imm) + (((s32)a.word[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrari_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrari.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] =\n        ((s32)a.word[i] &gt;&gt; imm) + (((s32)a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrarn.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u8 shift = (b.half[i] & 15);\n    if (shift == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i] >> shift) +\n                         (((s16)a.half[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u8 shift = (b.half[i - 8] & 15);\n    if (shift == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> shift) +\n                         (((s16)a.half[i - 8] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarn_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarn.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u8 shift = (b.half[i] &amp; 15);\n    if (shift == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i] &gt;&gt; shift) +\n                         (((s16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u8 shift = (b.half[i - 8] &amp; 15);\n    if (shift == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 8] &gt;&gt; shift) +\n                         (((s16)a.half[i - 8] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrarn.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u8 shift = (b.word[i] & 31);\n    if (shift == 0) {\n      dst.half[i] = (s16)(s32)a.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i] >> shift) +\n                          (((s32)a.word[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u8 shift = (b.word[i 
- 4] & 31);\n    if (shift == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 4] >> shift) +\n                          (((s32)a.word[i - 4] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarn_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarn.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u8 shift = (b.word[i] &amp; 31);\n    if (shift == 0) {\n      dst.half[i] = (s16)(s32)a.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i] &gt;&gt; shift) +\n                          (((s32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u8 shift = (b.word[i - 4] &amp; 31);\n    if (shift == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 4] &gt;&gt; shift) +\n                          (((s32)a.word[i - 4] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrarn.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u8 shift = (b.dword[i] & 63);\n    if (shift == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i] >> shift) +\n                          (((s64)a.dword[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u8 shift = (b.dword[i - 2] & 63);\n    if (shift == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> shift) +\n                          (((s64)a.dword[i - 2] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarn_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarn.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u8 shift = (b.dword[i] &amp; 63);\n    if (shift == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i] &gt;&gt; shift) +\n                          (((s64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u8 shift = (b.dword[i - 2] &amp; 63);\n    if (shift == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 2] &gt;&gt; shift) +\n                          (((s64)a.dword[i - 2] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrarni.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (s8)(((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 8];\n    } 
else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 8] >> imm) +\n                         (((s16)a.half[i - 8] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)b.half[i - 8];\n    } else {\n      dst.byte[i] = (s8)(((s16)b.half[i - 8] >> imm) +\n                         (((s16)b.half[i - 8] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 16];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 16] >> imm) +\n                         (((s16)a.half[i - 16] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarni.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (s8)(((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 8] &gt;&gt; imm) +\n                         (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)b.half[i - 
8];\n    } else {\n      dst.byte[i] = (s8)(((s16)b.half[i - 8] &gt;&gt; imm) +\n                         (((s16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (s8)(s16)a.half[i - 16];\n    } else {\n      dst.byte[i] = (s8)(((s16)a.half[i - 16] &gt;&gt; imm) +\n                         (((s16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvsrarni.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)b.qword[i];\n    } else {\n      dst.dword[i] = (s64)(((s128)b.qword[i] >> imm) +\n                           (((s128)b.qword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (s64)(((s128)a.qword[i - 1] >> imm) +\n                           (((s128)a.qword[i - 1] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)b.qword[i - 1];\n    } else {\n      dst.dword[i] = (s64)(((s128)b.qword[i - 1] >> imm) +\n                           
(((s128)b.qword[i - 1] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)a.qword[i - 2];\n    } else {\n      dst.dword[i] = (s64)(((s128)a.qword[i - 2] >> imm) +\n                           (((s128)a.qword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarni.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)b.qword[i];\n    } else {\n      dst.dword[i] = (s64)(((s128)b.qword[i] &gt;&gt; imm) +\n                           (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (s64)(((s128)a.qword[i - 1] &gt;&gt; imm) +\n                           (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)b.qword[i - 1];\n    } else {\n      dst.dword[i] = (s64)(((s128)b.qword[i - 1] &gt;&gt; imm) +\n                           (((s128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (s64)(s128)a.qword[i - 2];\n    } else {\n      dst.dword[i] = 
(s64)(((s128)a.qword[i - 2] &gt;&gt; imm) +\n                           (((s128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrarni.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)b.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)b.word[i] >> imm) +\n                          (((s32)b.word[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 4] >> imm) +\n                          (((s32)a.word[i - 4] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)b.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)b.word[i - 4] >> imm) +\n                          (((s32)b.word[i - 4] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 8];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 8] >> imm) +\n                          (((s32)a.word[i - 8] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real 
machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarni.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)b.word[i];\n    } else {\n      dst.half[i] = (s16)(((s32)b.word[i] &gt;&gt; imm) +\n                          (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 4] &gt;&gt; imm) +\n                          (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)b.word[i - 4];\n    } else {\n      dst.half[i] = (s16)(((s32)b.word[i - 4] &gt;&gt; imm) +\n                          (((s32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (s16)(s32)a.word[i - 8];\n    } else {\n      dst.half[i] = (s16)(((s32)a.word[i - 8] &gt;&gt; imm) +\n                          (((s32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrarni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrarni.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)b.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)b.dword[i] >> imm) +\n                          (((s64)b.dword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 2] >> imm) +\n                          (((s64)a.dword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)b.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)b.dword[i - 2] >> imm) +\n                          (((s64)b.dword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 4];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 4] >> imm) +\n                          (((s64)a.dword[i - 4] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrarni_w_d (__m256i a, 
__m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrarni.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)b.dword[i];\n    } else {\n      dst.word[i] = (s32)(((s64)b.dword[i] &gt;&gt; imm) +\n                          (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 2] &gt;&gt; imm) +\n                          (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)b.dword[i - 2];\n    } else {\n      dst.word[i] = (s32)(((s64)b.dword[i - 2] &gt;&gt; imm) +\n                          (((s64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (s32)(s64)a.dword[i - 4];\n    } else {\n      dst.word[i] = (s32)(((s64)a.dword[i - 4] &gt;&gt; imm) +\n                          (((s64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrl_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrl_b (__m256i a, 
__m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrl.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] >> (b.byte[i] & 0x7);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrl_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrl.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrl_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrl_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrl.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] >> (b.dword[i] & 0x3f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| 
CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrl_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrl.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrl_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrl_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrl.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] >> (b.half[i] & 0xf);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrl_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrl.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the 
unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] &gt;&gt; (b.half[i] &amp; 0xf);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrl_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrl_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrl.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] >> (b.word[i] & 0x1f);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrl_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrl.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsrli.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrli_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrli.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrli_d (__m256i a, 
imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrli.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrli_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrli.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrli.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrli_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrli.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrli.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] >> imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrli_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrli.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the 
result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] &gt;&gt; imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrln.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] = (i < 8) ? (u8)((u16)a.half[i] >> (b.half[i] & 15)) : 0;\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = (i < 24) ? (u8)((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrln_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrln.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] = (i &lt; 8) ? 
(u8)((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) : 0;\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = (i &lt; 24) ? (u8)((u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrln.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] = (i < 4) ? (u16)((u32)a.word[i] >> (b.word[i] & 31)) : 0;\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] =\n      (i < 12) ? (u16)((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrln_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrln.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] = (i &lt; 4) ? 
(u16)((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) : 0;\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] =\n      (i &lt; 12) ? (u16)((u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrln.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? (u32)((u64)a.dword[i] >> (b.dword[i] & 63)) : 0;\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] =\n      (i < 6) ? (u32)((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) : 0;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrln_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrln.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? 
(u32)((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) : 0;\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] =\n      (i &lt; 6) ? (u32)((u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) : 0;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlni.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.byte[i] =\n      (i < 8) ? (u8)((u16)b.half[i] >> imm) : (u8)((u16)a.half[i - 8] >> imm);\n}\nfor (int i = 16; i < 32; i++) {\n  dst.byte[i] = (i < 24) ? 
(u8)((u16)b.half[i - 8] >> imm)\n                         : (u8)((u16)a.half[i - 16] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlni.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.byte[i] =\n      (i &lt; 8) ? (u8)((u16)b.half[i] &gt;&gt; imm) : (u8)((u16)a.half[i - 8] &gt;&gt; imm);\n}\nfor (int i = 16; i &lt; 32; i++) {\n  dst.byte[i] = (i &lt; 24) ? 
(u8)((u16)b.half[i - 8] &gt;&gt; imm)\n                         : (u8)((u16)a.half[i - 16] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlni.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.dword[i] = (i < 1) ? (u64)((u128)b.qword[i] >> imm)\n                         : (u64)((u128)a.qword[i - 1] >> imm);\n}\nfor (int i = 2; i < 4; i++) {\n  dst.dword[i] = (i < 3) ? 
(u64)((u128)b.qword[i - 1] >> imm)\n                         : (u64)((u128)a.qword[i - 2] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlni.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.dword[i] = (i &lt; 1) ? (u64)((u128)b.qword[i] &gt;&gt; imm)\n                         : (u64)((u128)a.qword[i - 1] &gt;&gt; imm);\n}\nfor (int i = 2; i &lt; 4; i++) {\n  dst.dword[i] = (i &lt; 3) ? 
(u64)((u128)b.qword[i - 1] &gt;&gt; imm)\n                         : (u64)((u128)a.qword[i - 2] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlni.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.half[i] =\n      (i < 4) ? (u16)((u32)b.word[i] >> imm) : (u16)((u32)a.word[i - 4] >> imm);\n}\nfor (int i = 8; i < 16; i++) {\n  dst.half[i] = (i < 12) ? 
(u16)((u32)b.word[i - 4] >> imm)\n                         : (u16)((u32)a.word[i - 8] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlni.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.half[i] =\n      (i &lt; 4) ? (u16)((u32)b.word[i] &gt;&gt; imm) : (u16)((u32)a.word[i - 4] &gt;&gt; imm);\n}\nfor (int i = 8; i &lt; 16; i++) {\n  dst.half[i] = (i &lt; 12) ? 
(u16)((u32)b.word[i - 4] &gt;&gt; imm)\n                         : (u16)((u32)a.word[i - 8] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlni.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.word[i] = (i < 2) ? (u32)((u64)b.dword[i] >> imm)\n                        : (u32)((u64)a.dword[i - 2] >> imm);\n}\nfor (int i = 4; i < 8; i++) {\n  dst.word[i] = (i < 6) ? 
(u32)((u64)b.dword[i - 2] >> imm)\n                        : (u32)((u64)a.dword[i - 4] >> imm);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlni.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.word[i] = (i &lt; 2) ? (u32)((u64)b.dword[i] &gt;&gt; imm)\n                        : (u32)((u64)a.dword[i - 2] &gt;&gt; imm);\n}\nfor (int i = 4; i &lt; 8; i++) {\n  dst.word[i] = (i &lt; 6) ? 
(u32)((u64)b.dword[i - 2] &gt;&gt; imm)\n                        : (u32)((u64)a.dword[i - 4] &gt;&gt; imm);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlr.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 8-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  if ((b.byte[i] & 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] >> (b.byte[i] & 0x7)) +\n                  ((a.byte[i] >> ((b.byte[i] & 0x7) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlr_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlr.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  if ((b.byte[i] &amp; 0x7) == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] &gt;&gt; (b.byte[i] &amp; 0x7)) +\n                  
((a.byte[i] &gt;&gt; ((b.byte[i] &amp; 0x7) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlr.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if ((b.dword[i] & 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] >> (b.dword[i] & 0x3f)) +\n                   ((a.dword[i] >> ((b.dword[i] & 0x3f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlr_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlr.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if ((b.dword[i] &amp; 0x3f) == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] &gt;&gt; (b.dword[i] &amp; 0x3f)) +\n                   ((a.dword[i] &gt;&gt; 
((b.dword[i] &amp; 0x3f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlr.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if ((b.half[i] & 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] >> (b.half[i] & 0xf)) +\n                  ((a.half[i] >> ((b.half[i] & 0xf) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlr_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlr.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if ((b.half[i] &amp; 0xf) == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] &gt;&gt; (b.half[i] &amp; 0xf)) +\n                  ((a.half[i] &gt;&gt; ((b.half[i] &amp; 0xf) - 1)) &amp; 0x1);\n  
}\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlr.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if ((b.word[i] & 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] >> (b.word[i] & 0x1f)) +\n                  ((a.word[i] >> ((b.word[i] & 0x1f) - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlr_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlr.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if ((b.word[i] &amp; 0x1f) == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] &gt;&gt; (b.word[i] &amp; 0x1f)) +\n                  ((a.word[i] &gt;&gt; ((b.word[i] &amp; 0x1f) - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlri.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 8-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] >> imm) + ((a.byte[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlri_b (__m256i a, imm0_7 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlri.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 8-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  if (imm == 0) {\n    dst.byte[i] = a.byte[i];\n  } else {\n    dst.byte[i] = (a.byte[i] &gt;&gt; imm) + ((a.byte[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlri.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] >> imm) + ((a.dword[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlri_d (__m256i a, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlri.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (imm == 0) {\n    dst.dword[i] = a.dword[i];\n  } else {\n    dst.dword[i] = (a.dword[i] &gt;&gt; imm) + ((a.dword[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": 
"Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlri.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] >> imm) + ((a.half[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlri_h (__m256i a, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlri.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (imm == 0) {\n    dst.half[i] = a.half[i];\n  } else {\n    dst.half[i] = (a.half[i] &gt;&gt; imm) + ((a.half[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlri_w (__m256i a, 
imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlri.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by `imm`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] >> imm) + ((a.word[i] >> (imm - 1)) & 0x1);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlri_w (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlri.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by <code>imm</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (imm == 0) {\n    dst.word[i] = a.word[i];\n  } else {\n    dst.word[i] = (a.word[i] &gt;&gt; imm) + ((a.word[i] &gt;&gt; (imm - 1)) &amp; 0x1);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlrn.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in 
`a` by elements in `b`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u8 shift = (b.half[i] & 15);\n    if (shift == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i] >> shift) +\n                         (((u16)a.half[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u8 shift = (b.half[i - 8] & 15);\n    if (shift == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> shift) +\n                         (((u16)a.half[i - 8] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlrn_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrn.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u8 shift = (b.half[i] &amp; 15);\n    if (shift == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i] &gt;&gt; shift) +\n                         (((u16)a.half[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u8 shift = (b.half[i - 8] &amp; 15);\n    if (shift == 0) {\n      dst.byte[i] = 
(u8)(u16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 8] &gt;&gt; shift) +\n                         (((u16)a.half[i - 8] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlrn.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u8 shift = (b.word[i] & 31);\n    if (shift == 0) {\n      dst.half[i] = (u16)(u32)a.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i] >> shift) +\n                          (((u32)a.word[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u8 shift = (b.word[i - 4] & 31);\n    if (shift == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 4] >> shift) +\n                          (((u32)a.word[i - 4] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvsrlrn_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrn.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 16-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u8 shift = (b.word[i] &amp; 31);\n    if (shift == 0) {\n      dst.half[i] = (u16)(u32)a.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i] &gt;&gt; shift) +\n                          (((u32)a.word[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u8 shift = (b.word[i - 4] &amp; 31);\n    if (shift == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 4] &gt;&gt; shift) +\n                          (((u32)a.word[i - 4] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsrlrn.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u8 shift = (b.dword[i] & 63);\n    if (shift == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i] >> shift) +\n                          (((u64)a.dword[i] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u8 shift = (b.dword[i - 2] & 63);\n    if (shift == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> shift) +\n                          (((u64)a.dword[i - 2] >> (shift - 1)) & 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlrn_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrn.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u8 shift = (b.dword[i] &amp; 63);\n    if (shift == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i] &gt;&gt; shift) +\n                          (((u64)a.dword[i] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u8 shift = (b.dword[i - 2] &amp; 63);\n    if (shift == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 
2] &gt;&gt; shift) +\n                          (((u64)a.dword[i - 2] &gt;&gt; (shift - 1)) &amp; 0x1));\n    }\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlrni.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, truncate to 8-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (u8)(((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 8] >> imm) +\n                         (((u16)a.half[i - 8] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)b.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)b.half[i - 8] >> imm) +\n                         (((u16)b.half[i - 8] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 16];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 16] >> imm) +\n                         (((u16)a.half[i - 16] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrni.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 8-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)b.half[i];\n    } else {\n      dst.byte[i] =\n          (u8)(((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 8] &gt;&gt; imm) +\n                         (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)b.half[i - 8];\n    } else {\n      dst.byte[i] = (u8)(((u16)b.half[i - 8] &gt;&gt; imm) +\n                         (((u16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.byte[i] = (u8)(u16)a.half[i - 16];\n    } else {\n      dst.byte[i] = (u8)(((u16)a.half[i - 16] &gt;&gt; imm) +\n                         (((u16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlrni.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, truncate to 64-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)b.qword[i];\n    } else {\n      dst.dword[i] = (u64)(((u128)b.qword[i] >> imm) +\n                           (((u128)b.qword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (u64)(((u128)a.qword[i - 1] >> imm) +\n                           (((u128)a.qword[i - 1] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)b.qword[i - 1];\n    } else {\n      dst.dword[i] = (u64)(((u128)b.qword[i - 1] >> imm) +\n                           (((u128)b.qword[i - 1] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)a.qword[i - 2];\n    } else {\n      dst.dword[i] = (u64)(((u128)a.qword[i - 2] >> imm) +\n                           (((u128)a.qword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i 
__lasx_xvsrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrni.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 64-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)b.qword[i];\n    } else {\n      dst.dword[i] = (u64)(((u128)b.qword[i] &gt;&gt; imm) +\n                           (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)a.qword[i - 1];\n    } else {\n      dst.dword[i] = (u64)(((u128)a.qword[i - 1] &gt;&gt; imm) +\n                           (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)b.qword[i - 1];\n    } else {\n      dst.dword[i] = (u64)(((u128)b.qword[i - 1] &gt;&gt; imm) +\n                           (((u128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.dword[i] = (u64)(u128)a.qword[i - 2];\n    } else {\n      dst.dword[i] = (u64)(((u128)a.qword[i - 2] &gt;&gt; imm) +\n                           (((u128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 
imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlrni.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, truncate to 16-bit and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)b.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)b.word[i] >> imm) +\n                          (((u32)b.word[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 4] >> imm) +\n                          (((u32)a.word[i - 4] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)b.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)b.word[i - 4] >> imm) +\n                          (((u32)b.word[i - 4] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 8];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 8] >> imm) +\n                          (((u32)a.word[i - 8] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrni.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 16-bit and store the 
result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)b.word[i];\n    } else {\n      dst.half[i] = (u16)(((u32)b.word[i] &gt;&gt; imm) +\n                          (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 4] &gt;&gt; imm) +\n                          (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)b.word[i - 4];\n    } else {\n      dst.half[i] = (u16)(((u32)b.word[i - 4] &gt;&gt; imm) +\n                          (((u32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.half[i] = (u16)(u32)a.word[i - 8];\n    } else {\n      dst.half[i] = (u16)(((u32)a.word[i - 8] &gt;&gt; imm) +\n                          (((u32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvsrlrni.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, truncate to 32-bit and store the result to `dst`.\n\n\n\n\n\n### 
Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)b.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)b.dword[i] >> imm) +\n                          (((u64)b.dword[i] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 2] >> imm) +\n                          (((u64)a.dword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)b.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)b.dword[i - 2] >> imm) +\n                          (((u64)b.dword[i - 2] >> (imm - 1)) & 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 4];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 4] >> imm) +\n                          (((u64)a.dword[i - 4] >> (imm - 1)) & 0x1));\n    }\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsrlrni.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, truncate to 32-bit and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)b.dword[i];\n    } else {\n      dst.word[i] = (u32)(((u64)b.dword[i] &gt;&gt; imm) +\n                          (((u64)b.dword[i] &gt;&gt; (imm - 
1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 2] &gt;&gt; imm) +\n                          (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)b.dword[i - 2];\n    } else {\n      dst.word[i] = (u32)(((u64)b.dword[i - 2] &gt;&gt; imm) +\n                          (((u64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  } else {\n    if (imm == 0) {\n      dst.word[i] = (u32)(u64)a.dword[i - 4];\n    } else {\n      dst.word[i] = (u32)(((u64)a.dword[i - 4] &gt;&gt; imm) +\n                          (((u64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 0x1));\n    }\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssran.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    
dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssran_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssran.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp = (s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssran.bu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; 
i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp = (s16)a.half[i - 8] >> (b.half[i - 8] & 15);\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssran_bu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssran.bu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp = (s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i 
__lasx_xvssran_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssran.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssran_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssran.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp = (s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssran.hu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp = (s32)a.word[i - 4] >> (b.word[i - 4] & 31);\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssran_hu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssran.hu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = 
clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp = (s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssran.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssran_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssran.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift 
the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp = (s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssran.wu.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp = (s64)a.dword[i - 2] >> (b.dword[i - 2] & 63);\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | 
Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssran_wu_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssran.wu.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp = (s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)b.half[i] >> imm;\n    
dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    s16 temp = (s16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp = (s16)b.half[i - 8] >> imm;\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    s16 temp = (s16)a.half[i - 16] >> imm;\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp = (s16)b.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    s16 temp = (s16)a.half[i - 16] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.bu.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp = (s16)b.half[i] >> imm;\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    s16 temp = (s16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp = (s16)b.half[i - 8] >> imm;\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    s16 temp = (s16)a.half[i - 16] >> imm;\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.bu.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp = (s16)b.half[i] 
&gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    s16 temp = (s16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp = (s16)b.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    s16 temp = (s16)a.half[i - 16] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp = (s128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    s128 temp = (s128)b.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp = (s128)a.qword[i - 2] >> imm;\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency 
and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp = (s128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    s128 temp = (s128)b.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp = (s128)a.qword[i - 2] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.du.q 
xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp = (s128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    s128 temp = (s128)b.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp = (s128)a.qword[i - 2] >> imm;\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.du.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp = (s128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp = (s128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    s128 temp = (s128)b.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = 
clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp = (s128)a.qword[i - 2] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)b.word[i] >> imm;\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    s32 temp = (s32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp = (s32)b.word[i - 4] >> imm;\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    s32 temp = (s32)a.word[i - 8] >> imm;\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.h.w xr, xr, imm\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp = (s32)b.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    s32 temp = (s32)a.word[i - 8] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.hu.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp = (s32)b.word[i] >> imm;\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    s32 temp = (s32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i < 
16; i++) {\n  if (i < 12) {\n    s32 temp = (s32)b.word[i - 4] >> imm;\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    s32 temp = (s32)a.word[i - 8] >> imm;\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.hu.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp = (s32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    s32 temp = (s32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp = (s32)b.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    s32 temp = (s32)a.word[i - 8] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### 
Synopsis\n\n```c++\n__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)b.dword[i] >> imm;\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp = (s64)b.dword[i - 2] >> imm;\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp = (s64)a.dword[i - 4] >> imm;\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrani.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  
if (i &lt; 6) {\n    s64 temp = (s64)b.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp = (s64)a.dword[i - 4] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrani.wu.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp = (s64)b.dword[i] >> imm;\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp = (s64)b.dword[i - 2] >> imm;\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    s64 temp = (s64)a.dword[i - 4] >> imm;\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrani_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvssrani.wu.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp = (s64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    s64 temp = (s64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp = (s64)b.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    s64 temp = (s64)a.dword[i - 4] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrarn.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +\n         
    (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp;\n    if ((b.half[i - 8] & 15) == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +\n             (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarn_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarn.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp;\n    if ((b.half[i - 8] &amp; 15) == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp = ((s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +\n             (((s16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = 
clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrarn.bu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] >> (b.half[i] & 15)) +\n             (((s16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp;\n    if ((b.half[i - 8] & 15) == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp = ((s16)a.half[i - 8] >> (b.half[i - 8] & 15)) +\n             (((s16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarn_bu_h (__m256i a, __m256i b)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvssrarn.bu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (s16)a.half[i];\n    } else {\n      temp = ((s16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((s16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp;\n    if ((b.half[i - 8] &amp; 15) == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp = ((s16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +\n             (((s16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrarn.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to 
`dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +\n             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp;\n    if ((b.word[i - 4] & 31) == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +\n             (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarn_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarn.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp;\n    if ((b.word[i - 4] 
&amp; 31) == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp = ((s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +\n             (((s32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrarn.hu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] >> (b.word[i] & 31)) +\n             (((s32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp;\n    if ((b.word[i - 4] & 31) == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp = ((s32)a.word[i - 4] >> (b.word[i - 4] & 31)) +\n             (((s32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput 
(CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarn_hu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarn.hu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (s32)a.word[i];\n    } else {\n      temp = ((s32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((s32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp;\n    if ((b.word[i - 4] &amp; 31) == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp = ((s32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +\n             (((s32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)\n#include 
<lasxintrin.h>\nInstruction: xvssrarn.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp;\n    if ((b.dword[i - 2] & 63) == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +\n             (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarn_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarn.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 
63)) +\n             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp;\n    if ((b.dword[i - 2] &amp; 63) == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +\n             (((s64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrarn.wu.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((s64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp;\n    if ((b.dword[i - 2] & 63) == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n     
 temp = ((s64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +\n             (((s64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarn_wu_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarn.wu.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (s64)a.dword[i];\n    } else {\n      temp = ((s64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((s64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp;\n    if ((b.dword[i - 2] &amp; 63) == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +\n             (((s64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i - 8];\n    } else {\n      temp =\n          ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 16];\n    } else {\n      temp = ((s16)a.half[i - 16] >> imm) +\n             (((s16)a.half[i - 16] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, -128, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i - 8];\n    } else {\n      temp =\n          ((s16)b.half[i - 8] &gt;&gt; imm) + (((s16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 16];\n    } else {\n      temp = ((s16)a.half[i - 16] &gt;&gt; imm) +\n             (((s16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, -128, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.bu.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] >> imm) + (((s16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] >> imm) + (((s16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i - 8];\n    } else {\n      temp =\n          ((s16)b.half[i - 8] >> imm) + (((s16)b.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 16];\n    } else {\n      temp = ((s16)a.half[i - 16] >> imm) +\n             (((s16)a.half[i - 16] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<s16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.bu.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i];\n    } else {\n      temp = ((s16)b.half[i] &gt;&gt; imm) + (((s16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 8];\n    } else {\n      temp =\n          ((s16)a.half[i - 8] &gt;&gt; imm) + (((s16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)b.half[i - 8];\n    } else {\n      temp =\n          ((s16)b.half[i - 8] &gt;&gt; imm) + (((s16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  } else {\n    s16 temp;\n    if (imm == 0) {\n      temp = (s16)a.half[i - 16];\n    } else {\n      temp = ((s16)a.half[i - 16] &gt;&gt; imm) +\n             (((s16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;s16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] >> imm) +\n             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i - 1];\n    } else {\n      temp = ((s128)b.qword[i - 1] >> imm) +\n             (((s128)b.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 2];\n    } else {\n      temp = ((s128)a.qword[i - 2] >> imm) +\n             (((s128)a.qword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and 
Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +\n             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i - 1];\n    } else {\n      temp = ((s128)b.qword[i - 1] &gt;&gt; imm) +\n             (((s128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 9223372036854775807);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 2];\n    } else {\n      temp = ((s128)a.qword[i - 2] &gt;&gt; imm) +\n             (((s128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, -9223372036854775808, 
9223372036854775807);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.du.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] >> imm) + (((s128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] >> imm) +\n             (((s128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i - 1];\n    } else {\n      temp = ((s128)b.qword[i - 1] >> imm) +\n             (((s128)b.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<s128>(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 2];\n    } else {\n      temp = ((s128)a.qword[i - 2] >> imm) +\n             (((s128)a.qword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = 
clamp<s128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.du.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i];\n    } else {\n      temp = ((s128)b.qword[i] &gt;&gt; imm) + (((s128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 1];\n    } else {\n      temp = ((s128)a.qword[i - 1] &gt;&gt; imm) +\n             (((s128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)b.qword[i - 1];\n    } else {\n      temp = ((s128)b.qword[i - 1] &gt;&gt; imm) +\n             (((s128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    s128 temp;\n    if (imm == 0) {\n      temp = (s128)a.qword[i - 2];\n    } else {\n      temp = ((s128)a.qword[i - 2] &gt;&gt; imm) +\n             (((s128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] 
= clamp&lt;s128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i - 4];\n    } else {\n      temp =\n          ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 8];\n    } else {\n      temp =\n          ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, -32768, 32767);\n  
}\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i - 4];\n    } else {\n      temp =\n          ((s32)b.word[i - 4] &gt;&gt; imm) + (((s32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 8];\n    } else {\n      temp =\n          ((s32)a.word[i - 8] &gt;&gt; imm) + (((s32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, -32768, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.hu.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] >> imm) + (((s32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] >> imm) + (((s32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i - 4];\n    } else {\n      temp =\n          ((s32)b.word[i - 4] >> imm) + (((s32)b.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 8];\n    } else {\n      temp =\n          ((s32)a.word[i - 8] >> imm) + (((s32)a.word[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<s32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 
3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.hu.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i];\n    } else {\n      temp = ((s32)b.word[i] &gt;&gt; imm) + (((s32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 4];\n    } else {\n      temp =\n          ((s32)a.word[i - 4] &gt;&gt; imm) + (((s32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)b.word[i - 4];\n    } else {\n      temp =\n          ((s32)b.word[i - 4] &gt;&gt; imm) + (((s32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  } else {\n    s32 temp;\n    if (imm == 0) {\n      temp = (s32)a.word[i - 8];\n    } else {\n      temp =\n          ((s32)a.word[i - 8] &gt;&gt; imm) + (((s32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;s32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] >> imm) +\n             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i - 2];\n    } else {\n      temp = ((s64)b.dword[i - 2] >> imm) +\n             (((s64)b.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 4];\n    } else {\n      temp = ((s64)a.dword[i - 4] >> imm) +\n             (((s64)a.dword[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, -2147483648, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 
3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +\n             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i - 2];\n    } else {\n      temp = ((s64)b.dword[i - 2] &gt;&gt; imm) +\n             (((s64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 4];\n    } else {\n      temp = ((s64)a.dword[i - 4] &gt;&gt; imm) +\n             (((s64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, -2147483648, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrarni.wu.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nArithmetic right shift (with rounding) the signed 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] >> imm) + (((s64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] >> imm) +\n             (((s64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i - 2];\n    } else {\n      temp = ((s64)b.dword[i - 2] >> imm) +\n             (((s64)b.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 4];\n    } else {\n      temp = ((s64)a.dword[i - 4] >> imm) +\n             (((s64)a.dword[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<s64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrarni_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrarni.wu.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Arithmetic right shift (with rounding) the signed 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i];\n    } else {\n      temp = ((s64)b.dword[i] &gt;&gt; imm) + (((s64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 2];\n    } else {\n      temp = ((s64)a.dword[i - 2] &gt;&gt; imm) +\n             (((s64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)b.dword[i - 2];\n    } else {\n      temp = ((s64)b.dword[i - 2] &gt;&gt; imm) +\n             (((s64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  } else {\n    s64 temp;\n    if (imm == 0) {\n      temp = (s64)a.dword[i - 4];\n    } else {\n      temp = ((s64)a.dword[i - 4] &gt;&gt; imm) +\n             (((s64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;s64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrln.b.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrln_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrln.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i 
&lt; 24) {\n    u16 temp = (u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrln.bu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)a.half[i] >> (b.half[i] & 15);\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp = (u16)a.half[i - 8] >> (b.half[i - 8] & 15);\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrln_bu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrln.bu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15);\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp = (u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15);\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrln.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 31);\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvssrln_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrln.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp = (u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrln.hu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)a.word[i] >> (b.word[i] & 31);\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp = (u32)a.word[i - 4] >> (b.word[i - 4] & 
31);\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrln_hu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrln.hu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31);\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp = (u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31);\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrln.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the 
result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrln_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrln.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp = (u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i 
__lasx_xvssrln_wu_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrln.wu.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)a.dword[i] >> (b.dword[i] & 63);\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp = (u64)a.dword[i - 2] >> (b.dword[i - 2] & 63);\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrln_wu_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrln.wu.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63);\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp = (u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63);\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested 
on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)b.half[i] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    u16 temp = (u16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp = (u16)b.half[i - 8] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    u16 temp = (u16)a.half[i - 16] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp = (u16)b.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    u16 temp = (u16)a.half[i - 16] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.bu.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp = (u16)b.half[i] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    u16 temp = (u16)a.half[i - 8] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp = (u16)b.half[i - 8] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    u16 temp = (u16)a.half[i - 16] >> imm;\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| 
CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.bu.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp = (u16)b.half[i] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    u16 temp = (u16)a.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp = (u16)b.half[i - 8] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    u16 temp = (u16)a.half[i - 16] &gt;&gt; imm;\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store 
the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp = (u128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    u128 temp = (u128)b.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp = (u128)a.qword[i - 2] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp = (u128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    u128 temp = (u128)b.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp = (u128)a.qword[i - 2] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  
}\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.du.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp = (u128)b.qword[i] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    u128 temp = (u128)b.qword[i - 1] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp = (u128)a.qword[i - 2] >> imm;\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.du.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, 
clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp = (u128)b.qword[i] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp = (u128)a.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    u128 temp = (u128)b.qword[i - 1] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp = (u128)a.qword[i - 2] &gt;&gt; imm;\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)b.word[i] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    u32 temp = (u32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp = (u32)b.word[i - 4] >> imm;\n    dst.half[i] = 
clamp<u32>(temp, 0, 32767);\n  } else {\n    u32 temp = (u32)a.word[i - 8] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp = (u32)b.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    u32 temp = (u32)a.word[i - 8] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include 
<lasxintrin.h>\nInstruction: xvssrlni.hu.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp = (u32)b.word[i] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    u32 temp = (u32)a.word[i - 4] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp = (u32)b.word[i - 4] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    u32 temp = (u32)a.word[i - 8] >> imm;\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.hu.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp = (u32)b.word[i] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    u32 temp = (u32)a.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp = (u32)b.word[i - 4] &gt;&gt; imm;\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    u32 temp = (u32)a.word[i - 8] &gt;&gt; imm;\n   
 dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)b.dword[i] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp = (u64)b.dword[i - 2] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    u64 temp = (u64)a.dword[i - 4] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in 
signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp = (u64)b.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    u64 temp = (u64)a.dword[i - 4] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlni.wu.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp = (u64)b.dword[i] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp = (u64)b.dword[i - 2] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    u64 temp = 
(u64)a.dword[i - 4] >> imm;\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlni_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlni.wu.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp = (u64)b.dword[i] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    u64 temp = (u64)a.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp = (u64)b.dword[i - 2] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    u64 temp = (u64)a.dword[i - 4] &gt;&gt; imm;\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrlrn.b.h xr, xr, xr\nCPU Flags: 
LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +\n             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp;\n    if ((b.half[i - 8] & 15) == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +\n             (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrn_b_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrn.b.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = 
clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp;\n    if ((b.half[i - 8] &amp; 15) == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp = ((u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +\n             (((u16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrlrn.bu.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` by elements in `b`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if ((b.half[i] & 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] >> (b.half[i] & 15)) +\n             (((u16)a.half[i] >> ((b.half[i] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp;\n    if ((b.half[i - 8] & 15) == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp = ((u16)a.half[i - 8] >> (b.half[i - 8] & 15)) +\n             (((u16)a.half[i - 8] >> ((b.half[i - 8] & 15) - 1)) & 1);\n    }\n    dst.byte[i] = 
clamp<u16>(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrn_bu_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrn.bu.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if ((b.half[i] &amp; 15) == 0) {\n      temp = (u16)a.half[i];\n    } else {\n      temp = ((u16)a.half[i] &gt;&gt; (b.half[i] &amp; 15)) +\n             (((u16)a.half[i] &gt;&gt; ((b.half[i] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp;\n    if ((b.half[i - 8] &amp; 15) == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp = ((u16)a.half[i - 8] &gt;&gt; (b.half[i - 8] &amp; 15)) +\n             (((u16)a.half[i - 8] &gt;&gt; ((b.half[i - 8] &amp; 15) - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    dst.byte[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i 
__lasx_xvssrlrn_h_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrlrn.h.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +\n             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp;\n    if ((b.word[i - 4] & 31) == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +\n             (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrn_h_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrn.h.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n  
    temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp;\n    if ((b.word[i - 4] &amp; 31) == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp = ((u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +\n             (((u32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrlrn.hu.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` by elements in `b`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if ((b.word[i] & 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] >> (b.word[i] & 31)) +\n             (((u32)a.word[i] >> ((b.word[i] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp;\n    if ((b.word[i - 4] & 31) == 0) {\n    
  temp = (u32)a.word[i - 4];\n    } else {\n      temp = ((u32)a.word[i - 4] >> (b.word[i - 4] & 31)) +\n             (((u32)a.word[i - 4] >> ((b.word[i - 4] & 31) - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrn_hu_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrn.hu.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if ((b.word[i] &amp; 31) == 0) {\n      temp = (u32)a.word[i];\n    } else {\n      temp = ((u32)a.word[i] &gt;&gt; (b.word[i] &amp; 31)) +\n             (((u32)a.word[i] &gt;&gt; ((b.word[i] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp;\n    if ((b.word[i - 4] &amp; 31) == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp = ((u32)a.word[i - 4] &gt;&gt; (b.word[i - 4] &amp; 31)) +\n             (((u32)a.word[i - 4] &gt;&gt; ((b.word[i - 4] &amp; 31) - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    dst.half[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrlrn.w.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] >> (b.dword[i] & 63)) +\n             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp;\n    if ((b.dword[i - 2] & 63) == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +\n             (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrn_w_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrn.w.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp 
to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp;\n    if ((b.dword[i - 2] &amp; 63) == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +\n             (((u64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 2] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssrlrn.wu.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` by elements in `b`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if ((b.dword[i] & 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] >> 
(b.dword[i] & 63)) +\n             (((u64)a.dword[i] >> ((b.dword[i] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp;\n    if ((b.dword[i - 2] & 63) == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] >> (b.dword[i - 2] & 63)) +\n             (((u64)a.dword[i - 2] >> ((b.dword[i - 2] & 63) - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrn_wu_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrn.wu.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> by elements in <code>b</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if ((b.dword[i] &amp; 63) == 0) {\n      temp = (u64)a.dword[i];\n    } else {\n      temp = ((u64)a.dword[i] &gt;&gt; (b.dword[i] &amp; 63)) +\n             (((u64)a.dword[i] &gt;&gt; ((b.dword[i] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp;\n    if ((b.dword[i - 2] &amp; 63) == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] &gt;&gt; (b.dword[i - 2] &amp; 63)) +\n             (((u64)a.dword[i - 2] &gt;&gt; ((b.dword[i - 
2] &amp; 63) - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    dst.word[i] = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.b.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in signed 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i - 8];\n    } else {\n      temp =\n          ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 127);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 16];\n    } else {\n      temp = ((u16)a.half[i - 16] >> imm) +\n             (((u16)a.half[i - 16] >> (imm - 1)) & 1);\n    }\n    
dst.byte[i] = clamp<u16>(temp, 0, 127);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_b_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.b.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i - 8];\n    } else {\n      temp =\n          ((u16)b.half[i - 8] &gt;&gt; imm) + (((u16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 16];\n    } else {\n      temp = ((u16)a.half[i - 16] &gt;&gt; imm) +\n             (((u16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 127);\n  }\n}\n</code></pre>\n\n<p>Tested on real 
machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.bu.h xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 16-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 8-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  if (i < 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] >> imm) + (((u16)b.half[i] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] >> imm) + (((u16)a.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  }\n}\nfor (int i = 16; i < 32; i++) {\n  if (i < 24) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i - 8];\n    } else {\n      temp =\n          ((u16)b.half[i - 8] >> imm) + (((u16)b.half[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 16];\n    } else {\n      temp = ((u16)a.half[i - 16] >> imm) +\n             (((u16)a.half[i - 16] >> (imm - 1)) & 1);\n    }\n    dst.byte[i] = clamp<u16>(temp, 0, 255);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_bu_h (__m256i a, __m256i b, imm0_15 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.bu.h xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 16-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 8-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  if (i &lt; 8) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i];\n    } else {\n      temp = ((u16)b.half[i] &gt;&gt; imm) + (((u16)b.half[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 8];\n    } else {\n      temp =\n          ((u16)a.half[i - 8] &gt;&gt; imm) + (((u16)a.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  }\n}\nfor (int i = 16; i &lt; 32; i++) {\n  if (i &lt; 24) {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)b.half[i - 8];\n    } else {\n      temp =\n          ((u16)b.half[i - 8] &gt;&gt; imm) + (((u16)b.half[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  } else {\n    u16 temp;\n    if (imm == 0) {\n      temp = (u16)a.half[i - 16];\n    } else {\n      temp = ((u16)a.half[i - 16] &gt;&gt; imm) +\n             (((u16)a.half[i - 16] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.byte[i] = clamp&lt;u16&gt;(temp, 0, 255);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.d.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in signed 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] >> imm) +\n             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i - 1];\n    } else {\n      temp = ((u128)b.qword[i - 1] >> imm) +\n             (((u128)b.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 2];\n    } else {\n      temp = ((u128)a.qword[i - 2] >> imm) +\n             (((u128)a.qword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 9223372036854775807);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_d_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.d.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +\n             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i - 1];\n    } else {\n      temp = ((u128)b.qword[i - 1] &gt;&gt; imm) +\n             (((u128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 2];\n    } else {\n      temp = ((u128)a.qword[i - 2] &gt;&gt; imm) +\n             (((u128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 9223372036854775807);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.du.q xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 128-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 64-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  if (i < 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] >> imm) + (((u128)b.qword[i] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] >> imm) +\n             (((u128)a.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i < 4; i++) {\n  if (i < 3) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i - 1];\n    } else {\n      temp = ((u128)b.qword[i - 1] >> imm) +\n             (((u128)b.qword[i - 1] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 2];\n    } else {\n      temp = ((u128)a.qword[i - 2] >> imm) +\n             (((u128)a.qword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.dword[i] = clamp<u128>(temp, 0, 18446744073709551615);\n  }\n}\n```\n\nTested on real machine.\n\n\n### 
Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_du_q (__m256i a, __m256i b, imm0_127 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.du.q xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 128-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 64-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  if (i &lt; 1) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i];\n    } else {\n      temp = ((u128)b.qword[i] &gt;&gt; imm) + (((u128)b.qword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 1];\n    } else {\n      temp = ((u128)a.qword[i - 1] &gt;&gt; imm) +\n             (((u128)a.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  }\n}\nfor (int i = 2; i &lt; 4; i++) {\n  if (i &lt; 3) {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)b.qword[i - 1];\n    } else {\n      temp = ((u128)b.qword[i - 1] &gt;&gt; imm) +\n             (((u128)b.qword[i - 1] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  } else {\n    u128 temp;\n    if (imm == 0) {\n      temp = (u128)a.qword[i - 2];\n    } else {\n      temp = ((u128)a.qword[i - 2] &gt;&gt; imm) +\n             (((u128)a.qword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.dword[i] = clamp&lt;u128&gt;(temp, 0, 18446744073709551615);\n  }\n}\n</code></pre>\n\n<p>Tested on 
real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.h.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in signed 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i - 4];\n    } else {\n      temp =\n          ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 8];\n    } else {\n      temp =\n          ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 32767);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_h_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.h.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i - 4];\n    } else {\n      temp =\n          ((u32)b.word[i - 4] &gt;&gt; imm) + (((u32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 8];\n    } else {\n      temp =\n          ((u32)a.word[i - 8] &gt;&gt; imm) + (((u32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 32767);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.hu.w xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 32-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 16-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  if (i < 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] >> imm) + (((u32)b.word[i] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i - 4] >> imm) + (((u32)a.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i < 16; i++) {\n  if (i < 12) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i - 4];\n    } else {\n      temp =\n          ((u32)b.word[i - 4] >> imm) + (((u32)b.word[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 8];\n    } else {\n      temp =\n          ((u32)a.word[i - 8] >> imm) + (((u32)a.word[i - 8] >> (imm - 1)) & 1);\n    }\n    dst.half[i] = clamp<u32>(temp, 0, 65535);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": "<h3>Synopsis</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_hu_w (__m256i a, __m256i b, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.hu.w xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 32-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 16-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  if (i &lt; 4) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i];\n    } else {\n      temp = ((u32)b.word[i] &gt;&gt; imm) + (((u32)b.word[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 4];\n    } else {\n      temp =\n          ((u32)a.word[i - 4] &gt;&gt; imm) + (((u32)a.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  }\n}\nfor (int i = 8; i &lt; 16; i++) {\n  if (i &lt; 12) {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)b.word[i - 4];\n    } else {\n      temp =\n          ((u32)b.word[i - 4] &gt;&gt; imm) + (((u32)b.word[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  } else {\n    u32 temp;\n    if (imm == 0) {\n      temp = (u32)a.word[i - 8];\n    } else {\n      temp =\n          ((u32)a.word[i - 8] &gt;&gt; imm) + (((u32)a.word[i - 8] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.half[i] = clamp&lt;u32&gt;(temp, 0, 65535);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.w.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in signed 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] >> imm) +\n             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i - 2];\n    } else {\n      temp = ((u64)b.dword[i - 2] >> imm) +\n             (((u64)b.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 4];\n    } else {\n      temp = ((u64)a.dword[i - 4] >> imm) +\n             (((u64)a.dword[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 2147483647);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_w_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.w.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in signed 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +\n             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i - 2];\n    } else {\n      temp = ((u64)b.dword[i - 2] &gt;&gt; imm) +\n             (((u64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 4];\n    } else {\n      temp = ((u64)a.dword[i - 4] &gt;&gt; imm) +\n             (((u64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 2147483647);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include <lasxintrin.h>\nInstruction: xvssrlrni.wu.d xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nLogical right shift (with rounding) the unsigned 64-bit elements in `a` and `b` by `imm`, clamp to fit in unsigned 32-bit integer and store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  if (i < 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] >> imm) + (((u64)b.dword[i] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] >> imm) +\n             (((u64)a.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i < 8; i++) {\n  if (i < 6) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i - 2];\n    } else {\n      temp = ((u64)b.dword[i - 2] >> imm) +\n             (((u64)b.dword[i - 2] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 4];\n    } else {\n      temp = ((u64)a.dword[i - 4] >> imm) +\n             (((u64)a.dword[i - 4] >> (imm - 1)) & 1);\n    }\n    dst.word[i] = clamp<u64>(temp, 0, 4294967295);\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 4 | 2 |\n| 3C5000 | 4 | 1 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssrlrni_wu_d (__m256i a, __m256i b, imm0_63 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssrlrni.wu.d xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Logical right shift (with rounding) the unsigned 64-bit elements in <code>a</code> and <code>b</code> by <code>imm</code>, clamp to fit in unsigned 32-bit integer and store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  if (i &lt; 2) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i];\n    } else {\n      temp = ((u64)b.dword[i] &gt;&gt; imm) + (((u64)b.dword[i] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 2];\n    } else {\n      temp = ((u64)a.dword[i - 2] &gt;&gt; imm) +\n             (((u64)a.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  }\n}\nfor (int i = 4; i &lt; 8; i++) {\n  if (i &lt; 6) {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)b.dword[i - 2];\n    } else {\n      temp = ((u64)b.dword[i - 2] &gt;&gt; imm) +\n             (((u64)b.dword[i - 2] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  } else {\n    u64 temp;\n    if (imm == 0) {\n      temp = (u64)a.dword[i - 4];\n    } else {\n      temp = ((u64)a.dword[i - 4] &gt;&gt; imm) +\n             (((u64)a.dword[i - 4] &gt;&gt; (imm - 1)) &amp; 1);\n    }\n    dst.word[i] = clamp&lt;u64&gt;(temp, 0, 4294967295);\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>4</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>4</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Shift", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the signed 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (s8)ssub((s8)a.byte[i], (s8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: 
xvssub.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the unsigned 8-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 8-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = (u8)ssub((u8)a.byte[i], (u8)b.byte[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the signed 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) 
|\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 64-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)ssub((s64)a.dword[i], (s64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the unsigned 64-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 64-bit elements in 
<code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)ssub((u64)a.dword[i], (u64)b.dword[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the signed 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)ssub((s16)a.half[i], (s16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the unsigned 16-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 16-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)ssub((u16)a.half[i], (u16)b.half[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_w (__m256i a, __m256i b)", 
"markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the signed 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the signed 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)ssub((s32)a.word[i], (s32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvssub_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvssub_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvssub.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSaturating subtract the unsigned 32-bit elements in `a` and `b`, store the result to `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)ssub((u32)a.word[i], 
(u32)b.word[i]);\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvssub_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvssub.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Saturating subtract the unsigned 32-bit elements in <code>a</code> and <code>b</code>, store the result to <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)ssub((u32)a.word[i], (u32)b.word[i]);\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsub_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsub_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsub.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 8-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] - b.byte[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsub_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsub.b xr, xr, xr\nCPU Flags: 
LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 8-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] - b.byte[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsub_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsub_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsub.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 64-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] - b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsub_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsub.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 64-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] - b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsub_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsub_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsub.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 16-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] - b.half[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsub_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsub.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 16-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] - b.half[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsub_q (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsub_q (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsub.q xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 
128-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = a.qword[i] - b.qword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsub_q (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsub.q xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 128-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = a.qword[i] - b.qword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsub_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsub_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsub.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 32-bit elements in `a` and `b`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] - b.word[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsub_w (__m256i a, __m256i 
b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsub.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 32-bit elements in <code>a</code> and <code>b</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] - b.word[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsubi.bu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 8-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubi_bu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubi.bu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 8-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsubi.du xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 64-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubi_du (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubi.du xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 64-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 
imm)\n#include <lasxintrin.h>\nInstruction: xvsubi.hu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 16-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = a.half[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubi_hu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubi.hu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 16-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = a.half[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)\n#include <lasxintrin.h>\nInstruction: xvsubi.wu xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract 32-bit elements in `a` by `imm`, save the result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = a.word[i] - imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubi_wu (__m256i a, imm0_31 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubi.wu xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract 32-bit elements in <code>a</code> by <code>imm</code>, save the result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = a.word[i] - imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in 
<code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i] - (s64)(s32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_d_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i] - (u64)(u32)b.word[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i] - (s16)(s8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", 
"display": true}, {"name": "__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_h_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i] - (u16)(u8)b.byte[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned 
signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i] - (s128)(s64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and 
Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_q_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i] - (u128)(u64)b.dword[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_w_h (__m256i a, __m256i b)\n#include 
&lt;lasxintrin.h&gt;\nInstruction: xvsubwev.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned signed 16-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i] - (s32)(s16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwev.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract even-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwev_w_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwev.w.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract even-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre 
class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i] - (u32)(u16)b.half[2 * i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.d.w xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 32-bit elements in `a` and signed elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_d_w (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.d.w xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 32-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (s64)(s32)a.word[2 * i + 1] - (s64)(s32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.d.wu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 32-bit elements in `a` and unsigned elements in `b`, save the 64-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_d_wu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.d.wu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 32-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 64-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = (u64)(u32)a.word[2 * i + 1] - (u64)(u32)b.word[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer 
Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.h.b xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 8-bit elements in `a` and signed elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_h_b (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.h.b xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 8-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (s16)(s8)a.byte[2 * i + 1] - (s16)(s8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.h.bu xr, xr, xr\nCPU Flags: LASX\n```\n\n### 
Description\n\nSubtract odd-positioned unsigned 8-bit elements in `a` and unsigned elements in `b`, save the 16-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_h_bu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.h.bu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 8-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 16-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 16; i++) {\n  dst.half[i] = (u16)(u8)a.byte[2 * i + 1] - (u16)(u8)b.byte[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.q.d xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 64-bit elements in `a` and signed elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 
1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_q_d (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.q.d xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 64-bit elements in <code>a</code> and signed elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (s128)(s64)a.dword[2 * i + 1] - (s128)(s64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.q.du xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 64-bit elements in `a` and unsigned elements in `b`, save the 128-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 3 | 2 |\n| 3C5000 | 3 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">__m256i __lasx_xvsubwod_q_du (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.q.du xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 64-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 128-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 2; i++) {\n  dst.qword[i] = (u128)(u64)a.dword[2 * i + 1] - (u128)(u64)b.dword[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>3</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.w.h xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned signed 16-bit elements in `a` and signed elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_w_h (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.w.h xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned signed 16-bit elements in <code>a</code> and signed elements 
in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (s32)(s16)a.half[2 * i + 1] - (s32)(s16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvsubwod.w.hu xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nSubtract odd-positioned unsigned 16-bit elements in `a` and unsigned elements in `b`, save the 32-bit result in `dst`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 2 | 2 |\n| 3C5000 | 2 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvsubwod_w_hu (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsubwod.w.hu xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Subtract odd-positioned unsigned 16-bit elements in <code>a</code> and unsigned elements in <code>b</code>, save the 32-bit result in <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 8; i++) {\n  dst.word[i] = (u32)(u16)a.half[2 * i + 1] - (u32)(u16)b.half[2 * i + 1];\n}\n</code></pre>\n\n<p>Tested on 
real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>2</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Integer Computation", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvxor_v (__m256i a, __m256i b)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvxor_v (__m256i a, __m256i b)\n#include <lasxintrin.h>\nInstruction: xvxor.v xr, xr, xr\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise XOR between elements in `a` and `b`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 4; i++) {\n  dst.dword[i] = a.dword[i] ^ b.dword[i];\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvxor_v (__m256i a, __m256i b)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvxor.v xr, xr, xr\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise XOR between elements in <code>a</code> and <code>b</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 4; i++) {\n  dst.dword[i] = a.dword[i] ^ b.dword[i];\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)", "markdown": "### Synopsis\n\n```c++\n__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)\n#include 
<lasxintrin.h>\nInstruction: xvxori.b xr, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nCompute bitwise XOR between elements in `a` and `imm`.\n\n\n\n\n\n### Operation\n\n```c++\nfor (int i = 0; i < 32; i++) {\n  dst.byte[i] = a.byte[i] ^ imm;\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 4 |\n| 3C5000 | 1 | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">__m256i __lasx_xvxori_b (__m256i a, imm0_255 imm)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvxori.b xr, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Compute bitwise XOR between elements in <code>a</code> and <code>imm</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">for (int i = 0; i &lt; 32; i++) {\n  dst.byte[i] = a.byte[i] ^ imm;\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>4</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Logical", "extension": "LASX", "display": true}, {"name": "int __lasx_xbnz_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbnz_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetallnez.b fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 8-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 32; i++) {\n  if (a.byte[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">int __lasx_xbnz_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetallnez.b fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 8-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 32; i++) {\n  if (a.byte[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbnz_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbnz_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetallnez.d fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 64-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 4; i++) {\n  if (a.dword[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbnz_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetallnez.d fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 64-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 4; i++) {\n  if (a.dword[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on 
real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbnz_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbnz_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetallnez.h fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 16-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 16; i++) {\n  if (a.half[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbnz_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetallnez.h fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 16-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 16; i++) {\n  if (a.half[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbnz_v (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbnz_v (__m256i a)\n#include 
<lasxintrin.h>\nInstruction: xvsetnez.v fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if the whole vector `a` is non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = a.qword[0] != 0 || a.qword[1] != 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbnz_v (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetnez.v fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if the whole vector <code>a</code> is non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = a.qword[0] != 0 || a.qword[1] != 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbnz_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbnz_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetallnez.w fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 32-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 8; i++) {\n  if (a.word[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int 
__lasx_xbnz_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetallnez.w fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 32-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 8; i++) {\n  if (a.word[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbz_b (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbz_b (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetanyeqz.b fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 8-bit element in `a` equals to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 32; i++) {\n  if (a.byte[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbz_b (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetanyeqz.b fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 8-bit element in <code>a</code> equals to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 32; i++) {\n  if (a.byte[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency 
and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbz_d (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbz_d (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetanyeqz.d fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 64-bit element in `a` equals to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 4; i++) {\n  if (a.dword[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbz_d (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetanyeqz.d fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 64-bit element in <code>a</code> equals to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 4; i++) {\n  if (a.dword[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbz_h (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbz_h (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetanyeqz.h 
fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 16-bit element in `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 16; i++) {\n  if (a.half[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbz_h (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetanyeqz.h fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 16-bit element in <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 16; i++) {\n  if (a.half[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbz_v (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbz_v (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvseteqz.v fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if the whole vector `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = a.qword[0] == 0 && a.qword[1] == 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code 
class=\"language-c++\">int __lasx_xbz_v (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvseteqz.v fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if the whole vector <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = a.qword[0] == 0 &amp;&amp; a.qword[1] == 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xbz_w (__m256i a)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xbz_w (__m256i a)\n#include <lasxintrin.h>\nInstruction: xvsetanyeqz.w fcc, xr; bcnez\nCPU Flags: LASX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 32-bit element in `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 8; i++) {\n  if (a.word[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xbz_w (__m256i a)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvsetanyeqz.w fcc, xr; bcnez\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 32-bit element in <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 8; i++) {\n  if (a.word[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LASX", "display": true}, {"name": "int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)", "markdown": "### Synopsis\n\n```c++\nint __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)\n#include <lasxintrin.h>\nInstruction: xvpickve2gr.w r, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (s32)a.word[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lasx_xvpickve2gr_w (__m256i a, imm0_7 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve2gr.w r, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (s32)a.word[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "int __lsx_bnz_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bnz_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetallnez.b fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 
8-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 16; i++) {\n  if (a.byte[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bnz_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetallnez.b fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 8-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 16; i++) {\n  if (a.byte[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bnz_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bnz_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetallnez.d fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 64-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 2; i++) {\n  if (a.dword[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bnz_d (__m128i a)\n#include 
&lt;lsxintrin.h&gt;\nInstruction: vsetallnez.d fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 64-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 2; i++) {\n  if (a.dword[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bnz_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bnz_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetallnez.h fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 16-bit elements in `a` are non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 8; i++) {\n  if (a.half[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bnz_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetallnez.h fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 16-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 8; i++) {\n  if (a.half[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bnz_v (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bnz_v (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetnez.v fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if the whole vector `a` is non-zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = a.qword[0] != 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bnz_v (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetnez.v fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if the whole vector <code>a</code> is non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = a.qword[0] != 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bnz_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bnz_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetallnez.w fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if all 32-bit elements in `a` are non-zero.\n\n\n\n\n\n### 
Operation\n\n```c++\ndst = 1;\nfor (int i = 0; i < 4; i++) {\n  if (a.word[i] == 0) {\n    dst = 0;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bnz_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetallnez.w fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if all 32-bit elements in <code>a</code> are non-zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 1;\nfor (int i = 0; i &lt; 4; i++) {\n  if (a.word[i] == 0) {\n    dst = 0;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bz_b (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bz_b (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetanyeqz.b fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 8-bit element in `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 16; i++) {\n  if (a.byte[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bz_b (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetanyeqz.b fcc, vr; bcnez\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 8-bit element in <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 16; i++) {\n  if (a.byte[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bz_d (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bz_d (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetanyeqz.d fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 64-bit element in `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 2; i++) {\n  if (a.dword[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bz_d (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetanyeqz.d fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 64-bit element in <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 2; i++) {\n  if (a.dword[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bz_h (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bz_h (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetanyeqz.h fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 16-bit element in `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 8; i++) {\n  if (a.half[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bz_h (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetanyeqz.h fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 16-bit element in <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 8; i++) {\n  if (a.half[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bz_v (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bz_v (__m128i a)\n#include <lsxintrin.h>\nInstruction: vseteqz.v fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if the whole vector 
`a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = a.qword[0] == 0;\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bz_v (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vseteqz.v fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if the whole vector <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = a.qword[0] == 0;\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_bz_w (__m128i a)", "markdown": "### Synopsis\n\n```c++\nint __lsx_bz_w (__m128i a)\n#include <lsxintrin.h>\nInstruction: vsetanyeqz.w fcc, vr; bcnez\nCPU Flags: LSX\n```\n\n### Description\n\nExpected to be used in branches: branch if any 32-bit element in `a` is equal to zero.\n\n\n\n\n\n### Operation\n\n```c++\ndst = 0;\nfor (int i = 0; i < 4; i++) {\n  if (a.word[i] == 0) {\n    dst = 1;\n  }\n}\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | N/A | 2 |\n| 3C5000 | N/A | 2 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_bz_w (__m128i a)\n#include &lt;lsxintrin.h&gt;\nInstruction: vsetanyeqz.w fcc, vr; bcnez\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Expected to be used in branches: branch if any 32-bit element 
in <code>a</code> is equal to zero.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = 0;\nfor (int i = 0; i &lt; 4; i++) {\n  if (a.word[i] == 0) {\n    dst = 1;\n  }\n}\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>N/A</td>\n<td>2</td>\n</tr>\n</tbody>\n</table>", "group": "Branch", "extension": "LSX", "display": true}, {"name": "int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)", "markdown": "### Synopsis\n\n```c++\nint __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.b r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (s8)a.byte[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_vpickve2gr_b (__m128i a, imm0_15 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.b r, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (s8)a.byte[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, 
{"name": "int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)", "markdown": "### Synopsis\n\n```c++\nint __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.h r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (s16)a.half[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_vpickve2gr_h (__m128i a, imm0_7 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.h r, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (s16)a.half[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)", "markdown": "### Synopsis\n\n```c++\nint __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.w r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (s32)a.word[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": 
"<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">int __lsx_vpickve2gr_w (__m128i a, imm0_3 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.w r, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (s32)a.word[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)", "markdown": "### Synopsis\n\n```c++\nlong int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)\n#include <lasxintrin.h>\nInstruction: xvpickve2gr.d r, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (s64)a.dword[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">long int __lasx_xvpickve2gr_d (__m256i a, imm0_3 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve2gr.d r, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (s64)a.dword[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and 
Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)", "markdown": "### Synopsis\n\n```c++\nlong int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.d r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (s64)a.dword[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">long int __lsx_vpickve2gr_d (__m128i a, imm0_1 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.d r, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (s64)a.dword[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)", "markdown": "### Synopsis\n\n```c++\nunsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)\n#include <lasxintrin.h>\nInstruction: xvpickve2gr.wu r, xr, imm\nCPU Flags: LASX\n```\n\n### 
Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (u32)a.word[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">unsigned int __lasx_xvpickve2gr_wu (__m256i a, imm0_7 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve2gr.wu r, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (u32)a.word[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)", "markdown": "### Synopsis\n\n```c++\nunsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.bu r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (u8)a.byte[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">unsigned int __lsx_vpickve2gr_bu (__m128i a, imm0_15 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.bu r, vr, imm\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (u8)a.byte[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)", "markdown": "### Synopsis\n\n```c++\nunsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.hu r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (u16)a.half[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">unsigned int __lsx_vpickve2gr_hu (__m128i a, imm0_7 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.hu r, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (u16)a.half[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput 
(CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)", "markdown": "### Synopsis\n\n```c++\nunsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.wu r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (u32)a.word[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">unsigned int __lsx_vpickve2gr_wu (__m128i a, imm0_3 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.wu r, vr, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (u32)a.word[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)", "markdown": "### Synopsis\n\n```c++\nunsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)\n#include <lasxintrin.h>\nInstruction: xvpickve2gr.du r, xr, imm\nCPU Flags: LASX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and 
store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (u64)a.dword[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">unsigned long int __lasx_xvpickve2gr_du (__m256i a, imm0_3 idx)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvpickve2gr.du r, xr, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (u64)a.dword[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LASX", "display": true}, {"name": "unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)", "markdown": "### Synopsis\n\n```c++\nunsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)\n#include <lsxintrin.h>\nInstruction: vpickve2gr.du r, vr, imm\nCPU Flags: LSX\n```\n\n### Description\n\nPick the `lane` specified by `idx` from `a` and store into `dst`.\n\n\n\n\n\n### Operation\n\n```c++\ndst = (u64)a.dword[idx];\n```\n\nTested on real machine.\n\n\n### Latency and Throughput\n\n| CPU | Latency | Throughput (CPI) |\n|-----|---------|------------------|\n| 3A6000 | 1 | 1 |\n| 3C5000 | 1 | 1 |", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">unsigned long int __lsx_vpickve2gr_du (__m128i a, imm0_1 idx)\n#include &lt;lsxintrin.h&gt;\nInstruction: vpickve2gr.du r, vr, imm\nCPU Flags: 
LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Pick the <code>lane</code> specified by <code>idx</code> from <code>a</code> and store into <code>dst</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">dst = (u64)a.dword[idx];\n</code></pre>\n\n<p>Tested on real machine.</p>\n<h3>Latency and Throughput</h3>\n<table>\n<thead>\n<tr>\n<th>CPU</th>\n<th>Latency</th>\n<th>Throughput (CPI)</th>\n</tr>\n</thead>\n<tbody>\n<tr>\n<td>3A6000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n<tr>\n<td>3C5000</td>\n<td>1</td>\n<td>1</td>\n</tr>\n</tbody>\n</table>", "group": "Misc", "extension": "LSX", "display": true}, {"name": "void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)", "markdown": "### Synopsis\n\n```c++\nvoid __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)\n#include <lasxintrin.h>\nInstruction: xvst xr, r, imm\nCPU Flags: LASX\n```\n\n### Description\n\nWrite whole vector data in `data` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(256, data, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lasx_xvst (__m256i data, void * addr, imm_n2048_2047 offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvst xr, r, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Write whole vector data in <code>data</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(256, data, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)\n#include <lasxintrin.h>\nInstruction: xvstelm.b xr, r, imm, imm\nCPU Flags: LASX\n```\n\n### Description\n\nStore the 8-bit element 
in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(8, data.byte[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lasx_xvstelm_b (__m256i data, void * addr, imm_n128_127 offset, imm0_31 lane)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvstelm.b xr, r, imm, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 8-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(8, data.byte[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)\n#include <lasxintrin.h>\nInstruction: xvstelm.d xr, r, imm, imm\nCPU Flags: LASX\n```\n\n### Description\n\nStore the 64-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(64, data.dword[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lasx_xvstelm_d (__m256i data, void * addr, imm_n128_127 offset, imm0_3 lane)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvstelm.d xr, r, imm, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 64-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(64, data.dword[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "void __lasx_xvstelm_h (__m256i data, 
void * addr, imm_n128_127 offset, imm0_15 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)\n#include <lasxintrin.h>\nInstruction: xvstelm.h xr, r, imm, imm\nCPU Flags: LASX\n```\n\n### Description\n\nStore the 16-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(16, data.half[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lasx_xvstelm_h (__m256i data, void * addr, imm_n128_127 offset, imm0_15 lane)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvstelm.h xr, r, imm, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 16-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(16, data.half[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)\n#include <lasxintrin.h>\nInstruction: xvstelm.w xr, r, imm, imm\nCPU Flags: LASX\n```\n\n### Description\n\nStore the 32-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(32, data.word[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lasx_xvstelm_w (__m256i data, void * addr, imm_n128_127 offset, imm0_7 lane)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvstelm.w xr, r, imm, imm\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 32-bit element in <code>data</code> specified by <code>lane</code> to 
memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(32, data.word[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "void __lasx_xvstx (__m256i data, void * addr, long int offset)", "markdown": "### Synopsis\n\n```c++\nvoid __lasx_xvstx (__m256i data, void * addr, long int offset)\n#include <lasxintrin.h>\nInstruction: xvstx xr, r, r\nCPU Flags: LASX\n```\n\n### Description\n\nWrite whole-vector data in `data` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(256, data, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lasx_xvstx (__m256i data, void * addr, long int offset)\n#include &lt;lasxintrin.h&gt;\nInstruction: xvstx xr, r, r\nCPU Flags: LASX\n</code></pre>\n\n<h3>Description</h3>\n<p>Write whole-vector data in <code>data</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(256, data, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LASX", "display": true}, {"name": "void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)", "markdown": "### Synopsis\n\n```c++\nvoid __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)\n#include <lsxintrin.h>\nInstruction: vst vr, r, imm\nCPU Flags: LSX\n```\n\n### Description\n\nWrite whole vector data in `data` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(128, data, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lsx_vst (__m128i data, void * addr, imm_n2048_2047 offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vst vr, r, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Write whole vector data in <code>data</code> 
to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(128, data, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)\n#include <lsxintrin.h>\nInstruction: vstelm.b vr, r, imm, imm\nCPU Flags: LSX\n```\n\n### Description\n\nStore the 8-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(8, data.byte[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lsx_vstelm_b (__m128i data, void * addr, imm_n128_127 offset, imm0_15 lane)\n#include &lt;lsxintrin.h&gt;\nInstruction: vstelm.b vr, r, imm, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 8-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(8, data.byte[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "void __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lsx_vstelm_d (__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)\n#include <lsxintrin.h>\nInstruction: vstelm.d vr, r, imm, imm\nCPU Flags: LSX\n```\n\n### Description\n\nStore the 64-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(64, data.dword[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lsx_vstelm_d 
(__m128i data, void * addr, imm_n128_127 offset, imm0_1 lane)\n#include &lt;lsxintrin.h&gt;\nInstruction: vstelm.d vr, r, imm, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 64-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(64, data.dword[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)\n#include <lsxintrin.h>\nInstruction: vstelm.h vr, r, imm, imm\nCPU Flags: LSX\n```\n\n### Description\n\nStore the 16-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(16, data.half[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lsx_vstelm_h (__m128i data, void * addr, imm_n128_127 offset, imm0_7 lane)\n#include &lt;lsxintrin.h&gt;\nInstruction: vstelm.h vr, r, imm, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 16-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(16, data.half[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)", "markdown": "### Synopsis\n\n```c++\nvoid __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)\n#include <lsxintrin.h>\nInstruction: vstelm.w vr, r, imm, imm\nCPU Flags: LSX\n```\n\n### Description\n\nStore the 
32-bit element in `data` specified by `lane` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(32, data.word[lane], addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lsx_vstelm_w (__m128i data, void * addr, imm_n128_127 offset, imm0_3 lane)\n#include &lt;lsxintrin.h&gt;\nInstruction: vstelm.w vr, r, imm, imm\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Store the 32-bit element in <code>data</code> specified by <code>lane</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(32, data.word[lane], addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}, {"name": "void __lsx_vstx (__m128i data, void * addr, long int offset)", "markdown": "### Synopsis\n\n```c++\nvoid __lsx_vstx (__m128i data, void * addr, long int offset)\n#include <lsxintrin.h>\nInstruction: vstx vr, r, r\nCPU Flags: LSX\n```\n\n### Description\n\nWrite whole-vector data in `data` to memory address `addr + offset`.\n\n\n\n\n\n### Operation\n\n```c++\nmemory_store(128, data, addr + offset);\n```", "content": "<h3>Synopsis</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">void __lsx_vstx (__m128i data, void * addr, long int offset)\n#include &lt;lsxintrin.h&gt;\nInstruction: vstx vr, r, r\nCPU Flags: LSX\n</code></pre>\n\n<h3>Description</h3>\n<p>Write whole-vector data in <code>data</code> to memory address <code>addr + offset</code>.</p>\n<h3>Operation</h3>\n<pre class=\"codehilite\"><code class=\"language-c++\">memory_store(128, data, addr + offset);\n</code></pre>", "group": "Memory Load & Store", "extension": "LSX", "display": true}] ;
+
+  let miniSearch = new MiniSearch({
+    fields: ['name', 'markdown', 'group', 'extension'],
+    storeFields: ['group', 'extension']
+  });
+
+  let docs = [];
+  for (let i = 0;i < allIntrinsics.length;i++) {
+    docs.push({
+      id: i,
+      ...allIntrinsics[i],
+    });
+  }
+  miniSearch.addAll(docs);
+
+  createApp({
+    setup() {
+      const urlParams = new URLSearchParams(window.location.search);
+      let initialSearch = urlParams.get('q');
+      if (initialSearch === null) {
+        initialSearch = "";
+      }
+      const search = ref(initialSearch);
+
+      const allGroups =  ["Bitwise Operations", "Branch", "Floating Point Comparison", "Floating Point Computation", "Floating Point Conversion", "Floating Point Misc", "Fused Multiply-Add", "Integer Comparison", "Integer Computation", "Logical", "Memory Load & Store", "Misc", "Permutation", "Shift", "Shuffling"] ;
+      const allExtensions = ["LSX", "LASX"];
+      const groups = ref(allGroups);
+      const extensions = ref(allExtensions);
+
+      const intrinsics = computed(() => {
+        // update search to url
+        // https://stackoverflow.com/questions/10970078/modifying-a-query-string-without-reloading-the-page
+        const url = window.location.href;
+        var r = new URL(url);
+        r.searchParams.set('q', search.value);
+        window.history.pushState({ path: r.href }, '', r.href);
+
+        let length = 0;
+        if (search.value === "") {
+          for (let val of allIntrinsics) {
+            if (!groups.value.includes(val.group)) {
+              val.display = false;
+            } else if (!extensions.value.includes(val.extension)) {
+              val.display = false;
+            } else {
+              val.display = true;
+              length = length + 1;
+            }
+          }
+        } else {
+          for (let val of allIntrinsics) {
+            val.display = false;
+          }
+
+          let candidates = miniSearch.search(search.value, {
+            prefix: true,
+            fuzzy: 0.2,
+            combineWith: 'AND'
+          });
+          for (let val of candidates) {
+            if (!groups.value.includes(val.group)) {
+              continue;
+            } else if (!extensions.value.includes(val.extension)) {
+              continue;
+            }
+            allIntrinsics[val.id].display = true;
+            length = length + 1;
+          }
+        }
+
+        return {
+          inner: allIntrinsics,
+          length: length,
+        };
+      });
+
+      return {
+        intrinsics,
+        search,
+        groups,
+        allGroups,
+        extensions,
+        allExtensions
+      }
+    }
+  }).mount('#app')
+</script>
+              
+            </div>
+          </div><footer>
+    <!-- Previous / next page buttons -->
+    <div class="rst-footer-buttons" role="navigation" aria-label="Footer Navigation">
+        <a href="../migrating_sse/" class="btn btn-neutral float-left" title="Migrating from SSE to LSX"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+        <a href="../lasx/bitwise_operations/" class="btn btn-neutral float-right" title="Bitwise Operations">Next <span class="icon icon-circle-arrow-right"></span></a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright notice -->
+      <p>Copyright &copy; 2023 Jiajie Chen</p>
+  </div>
+
+  <!-- Build tool and theme attribution -->
+  Built with <a href="https://www.mkdocs.org/">MkDocs</a> using a <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+          
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+  <!-- Bottom bar repeating the previous / next page links shown in the footer -->
+  <div class="rst-versions" role="note" aria-label="Versions">
+  <span class="rst-current-version" data-toggle="rst-current-version">
+    
+    
+      <span><a href="../migrating_sse/" style="color: #fcfcfc">&laquo; Previous</a></span>
+    
+    
+      <span><a href="../lasx/bitwise_operations/" style="color: #fcfcfc">Next &raquo;</a></span>
+    
+  </span>
+</div>
+    <!-- Theme runtime scripts.
+         NOTE(review): the theme scripts presumably expect jQuery and base_url
+         to be defined before they load; confirm before reordering. -->
+    <script src="../js/jquery-3.6.0.min.js"></script>
+    <script>var base_url = "..";</script>
+    <script src="../js/theme_extra.js"></script>
+    <script src="../js/theme.js"></script>
+    <script>
+        // Runs once the DOM is ready (jQuery's ready shorthand) and enables the
+        // theme's navigation; `true` is passed through to the theme as-is.
+        jQuery(function () {
+            SphinxRtdTheme.Navigation.enable(true);
+        });
+    </script>
+
+</body>
+</html>